1use std::borrow::Cow;
2use std::collections::VecDeque;
3use std::sync::Arc;
4use std::sync::OnceLock;
5
6use html_inspector::{Attribute, EventSource, InputFormat, ParseEvent, Span, ValidatorError};
7use rustc_hash::FxHashMap;
8
9#[cfg(feature = "html5ever")]
10mod html5ever_rcdom;
11#[cfg(feature = "html5ever")]
12mod html5ever_source;
13mod named_entities;
14#[cfg(feature = "html5ever")]
15pub use html5ever_source::Html5EverEventSource;
16
/// Event source for HTML/XHTML input that wraps one of the available backends.
///
/// The backend is chosen by [`HtmlEventSource::from_shared_bytes`]: with the
/// `html5ever` feature enabled, `InputFormat::Html` input goes to
/// [`Html5EverEventSource`]; everything else goes to [`SimpleHtmlEventSource`].
#[derive(Clone)]
pub enum HtmlEventSource {
    /// The hand-written scanner defined in this file.
    Simple(SimpleHtmlEventSource),
    /// The html5ever-backed source; only exists when the `html5ever` feature is on.
    #[cfg(feature = "html5ever")]
    Html5Ever(Html5EverEventSource),
}
23
24impl HtmlEventSource {
25 pub fn from_bytes(
26 name: impl Into<String>,
27 format: InputFormat,
28 bytes: Vec<u8>,
29 ) -> Result<Self, ValidatorError> {
30 Self::from_shared_bytes(name, format, Arc::new(bytes))
31 }
32
33 pub fn from_shared_bytes(
34 name: impl Into<String>,
35 format: InputFormat,
36 bytes: Arc<Vec<u8>>,
37 ) -> Result<Self, ValidatorError> {
38 let name = name.into();
39
40 #[cfg(feature = "html5ever")]
41 if format == InputFormat::Html {
42 return Ok(HtmlEventSource::Html5Ever(
43 Html5EverEventSource::from_shared_bytes(name, bytes),
44 ));
45 }
46
47 Ok(HtmlEventSource::Simple(
48 SimpleHtmlEventSource::from_shared_bytes(name, format, bytes),
49 ))
50 }
51
52 pub fn from_str(
53 name: impl Into<String>,
54 format: InputFormat,
55 s: &str,
56 ) -> Result<Self, ValidatorError> {
57 Self::from_bytes(name, format, s.as_bytes().to_vec())
58 }
59}
60
#[cfg(test)]
mod html_event_source_tests {
    use super::*;

    /// Pulls events from `src` until a `StartTag` named exactly `wanted` appears and
    /// returns that event; panics if the source runs dry first.
    fn start_tag_named(src: &mut HtmlEventSource, wanted: &str) -> ParseEvent {
        loop {
            let Some(event) = src.next_event().unwrap() else {
                panic!("did not find <{wanted}> StartTag");
            };
            if matches!(&event, ParseEvent::StartTag { name, .. } if name == wanted) {
                return event;
            }
        }
    }

    #[test]
    fn html_event_source_selects_backend_by_feature_and_format() {
        let html = HtmlEventSource::from_str("t", InputFormat::Html, "<p>hi</p>").unwrap();
        // HTML input goes to html5ever when that feature is compiled in.
        #[cfg(feature = "html5ever")]
        let html_backend_ok = matches!(html, HtmlEventSource::Html5Ever(_));
        #[cfg(not(feature = "html5ever"))]
        let html_backend_ok = matches!(html, HtmlEventSource::Simple(_));
        assert!(html_backend_ok);

        // XHTML always uses the simple scanner.
        let xhtml = HtmlEventSource::from_str("t", InputFormat::Xhtml, "<p/>").unwrap();
        let xhtml_backend_ok = matches!(xhtml, HtmlEventSource::Simple(_));
        assert!(xhtml_backend_ok);
    }

    #[test]
    fn simple_scanner_normalizes_tag_and_attribute_names_only_for_html() {
        let mut html =
            HtmlEventSource::from_str("t", InputFormat::Html, "<DiV CLass=foo></DiV>").unwrap();
        let ParseEvent::StartTag { name, attrs, .. } = start_tag_named(&mut html, "div") else {
            unreachable!("start_tag_named only returns StartTag events");
        };
        assert_eq!(name, "div");
        assert_eq!(attrs.len(), 1);
        let only = &attrs[0];
        assert_eq!(only.name, "class");
        assert_eq!(only.value.as_deref(), Some("foo"));

        let mut xhtml =
            HtmlEventSource::from_str("t", InputFormat::Xhtml, "<DiV CLass=\"foo\"/>").unwrap();
        let ParseEvent::StartTag { name, attrs, .. } = start_tag_named(&mut xhtml, "DiV") else {
            unreachable!("start_tag_named only returns StartTag events");
        };
        assert_eq!(name, "DiV");
        assert_eq!(attrs.len(), 1);
        let only = &attrs[0];
        assert_eq!(only.name, "CLass");
        assert_eq!(only.value.as_deref(), Some("foo"));
    }

    #[test]
    fn simple_scanner_lowercases_ascii_in_non_ascii_attribute_names_for_html_only() {
        let mut html =
            HtmlEventSource::from_str("t", InputFormat::Html, "<div ❤A=foo></div>").unwrap();
        let ParseEvent::StartTag { attrs, .. } = start_tag_named(&mut html, "div") else {
            unreachable!("start_tag_named only returns StartTag events");
        };
        assert_eq!(attrs.len(), 1);
        let only = &attrs[0];
        assert_eq!(only.name, "❤a");
        assert_eq!(only.value.as_deref(), Some("foo"));

        let mut xhtml =
            HtmlEventSource::from_str("t", InputFormat::Xhtml, "<div ❤A=\"foo\"/>").unwrap();
        let ParseEvent::StartTag { attrs, .. } = start_tag_named(&mut xhtml, "div") else {
            unreachable!("start_tag_named only returns StartTag events");
        };
        assert_eq!(attrs.len(), 1);
        let only = &attrs[0];
        assert_eq!(only.name, "❤A");
        assert_eq!(only.value.as_deref(), Some("foo"));
    }

    #[test]
    fn bytes_at_cursor_is_safe_at_eof() {
        let mut scanner = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>");
        assert!(scanner.bytes_at_cursor(b"<p"));

        // Park the cursor exactly at end of input: only the empty needle matches.
        let input_len = scanner.bytes.len();
        scanner.cursor = input_len;
        assert!(!scanner.bytes_at_cursor(b"<"));
        assert!(scanner.bytes_at_cursor(b""));
    }
}
151
152impl EventSource for HtmlEventSource {
153 fn source_name(&self) -> &str {
154 match self {
155 HtmlEventSource::Simple(s) => s.source_name(),
156 #[cfg(feature = "html5ever")]
157 HtmlEventSource::Html5Ever(s) => s.source_name(),
158 }
159 }
160
161 fn format(&self) -> InputFormat {
162 match self {
163 HtmlEventSource::Simple(s) => s.format(),
164 #[cfg(feature = "html5ever")]
165 HtmlEventSource::Html5Ever(s) => s.format(),
166 }
167 }
168
169 fn next_event(&mut self) -> Result<Option<ParseEvent>, ValidatorError> {
170 match self {
171 HtmlEventSource::Simple(s) => s.next_event(),
172 #[cfg(feature = "html5ever")]
173 HtmlEventSource::Html5Ever(s) => s.next_event(),
174 }
175 }
176}
177
/// Hand-written scanner over a shared byte buffer that queues [`ParseEvent`]s.
///
/// The `scan_*` methods advance `cursor` through `bytes` and append the events they
/// recognise to `pending`.
#[derive(Clone)]
pub struct SimpleHtmlEventSource {
    /// Name supplied at construction.
    /// NOTE(review): presumably what `source_name()` reports — the trait impl is outside this view.
    name: String,
    /// Input syntax; for `InputFormat::Html`, `normalize_name` lowercases ASCII letters in names.
    format: InputFormat,
    /// The complete input, shared so that clones do not copy it.
    bytes: Arc<Vec<u8>>,
    /// Byte offset into `bytes` of the next unscanned byte.
    cursor: usize,
    /// Line of `cursor`; starts at 1 and is incremented by `bump_to` for every `\n` byte.
    line: u32,
    /// Column of `cursor`; starts at 1, resets to 1 after `\n`, and otherwise advances
    /// once per byte (not per character).
    col: u32,
    /// Names of the elements pushed by `scan_start_tag` and not yet removed by
    /// `pop_open_element`, innermost last.
    open_elements: Vec<String>,
    /// Namespace of each entry in `open_elements`; the two vectors are pushed and
    /// truncated together.
    open_namespaces: Vec<HtmlNamespace>,
    /// Events already produced by the scan methods.
    /// NOTE(review): presumably drained by `next_event()` — confirm in the trait impl.
    pending: VecDeque<ParseEvent>,
    /// Set once scanning is over; `scan_next` returns immediately when it is true.
    finished: bool,
}
191
/// Namespace tracked for each open element by the simple scanner.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum HtmlNamespace {
    /// Default namespace; also used when no element is open.
    Html,
    /// Assigned to an `svg` start tag seen in HTML context and inherited by its descendants.
    Svg,
    /// Assigned to a `math` start tag seen in HTML context and inherited by its descendants.
    Math,
}
198
199impl SimpleHtmlEventSource {
200 pub fn from_bytes(name: impl Into<String>, format: InputFormat, bytes: Vec<u8>) -> Self {
201 Self::from_shared_bytes(name, format, Arc::new(bytes))
202 }
203
204 pub fn from_shared_bytes(
205 name: impl Into<String>,
206 format: InputFormat,
207 bytes: Arc<Vec<u8>>,
208 ) -> Self {
209 Self {
210 name: name.into(),
211 format,
212 bytes,
213 cursor: 0,
214 line: 1,
215 col: 1,
216 open_elements: Vec::new(),
217 open_namespaces: Vec::new(),
218 pending: VecDeque::new(),
219 finished: false,
220 }
221 }
222
223 pub fn from_str(name: impl Into<String>, format: InputFormat, s: &str) -> Self {
224 Self::from_bytes(name, format, s.as_bytes().to_vec())
225 }
226
227 fn bump_to(&mut self, new_cursor: usize) {
228 for &b in &self.bytes[self.cursor..new_cursor] {
229 if b == b'\n' {
230 self.line += 1;
231 self.col = 1;
232 } else {
233 self.col += 1;
234 }
235 }
236 self.cursor = new_cursor;
237 }
238
239 #[inline]
240 fn bytes_at_cursor(&self, needle: &[u8]) -> bool {
241 self.bytes
242 .get(self.cursor..)
243 .is_some_and(|tail| tail.starts_with(needle))
244 }
245
    /// Builds a [`Span`] from a byte range and the line/column of its start.
    ///
    /// Thin wrapper over `Span::new`; it does not read any scanner state.
    fn current_span(&self, start: usize, end: usize, start_line: u32, start_col: u32) -> Span {
        Span::new(start, end, start_line, start_col)
    }
249
250 fn emit_tokenizer_eof_after_lt(&mut self, start: usize, start_line: u32, start_col: u32) {
251 let end = self.bytes.len();
252 self.bump_to(end);
253 self.pending.push_back(ParseEvent::ParseError {
254 code: "html.tokenizer.eof_after_lt".to_string(),
255 message: "End of file after “<”.".to_string(),
256 span: Some(self.current_span(start, end, start_line, start_col)),
257 });
258 self.finished = true;
259 }
260
261 fn normalize_name(&self, s: impl Into<String>) -> String {
262 let mut out = s.into();
263 if self.format == InputFormat::Html {
264 out.make_ascii_lowercase();
265 }
266 out
267 }
268
269 fn current_text_mode_kind(&self) -> TextModeKind {
270 let (Some(name), Some(&HtmlNamespace::Html)) =
271 (self.open_elements.last(), self.open_namespaces.last())
272 else {
273 return TextModeKind::Data;
274 };
275 match name.as_str() {
276 "script" | "style" | "xmp" | "iframe" | "noembed" | "noframes" => TextModeKind::RawText,
277 "title" | "textarea" => TextModeKind::RcData,
278 "plaintext" => TextModeKind::Plaintext,
279 _ => TextModeKind::Data,
280 }
281 }
282
283 fn current_insertion_namespace(&self) -> HtmlNamespace {
284 let ns = self
285 .open_namespaces
286 .last()
287 .copied()
288 .unwrap_or(HtmlNamespace::Html);
289 if ns == HtmlNamespace::Svg
290 && self
291 .open_elements
292 .last()
293 .is_some_and(|name| matches!(name.as_str(), "foreignobject" | "desc" | "title"))
294 {
295 HtmlNamespace::Html
296 } else {
297 ns
298 }
299 }
300
301 fn namespace_for_start_tag(&self, name: &str) -> HtmlNamespace {
302 match self.current_insertion_namespace() {
303 HtmlNamespace::Html => match name {
304 "svg" => HtmlNamespace::Svg,
305 "math" => HtmlNamespace::Math,
306 _ => HtmlNamespace::Html,
307 },
308 ns => ns,
309 }
310 }
311
    /// Scans one construct at the cursor and queues the resulting events in `pending`.
    ///
    /// This is the dispatcher of the scanner: it looks at the bytes at the cursor and
    /// either handles the construct inline (plain text, `<>`, `</>`) or delegates to
    /// one of the `scan_*` methods. The order of the checks below is significant,
    /// because later checks rely on earlier ones having returned.
    ///
    /// Does nothing once `finished` is set, and sets `finished` when the cursor has
    /// reached the end of the input. As written, this function itself only ever
    /// returns `Ok(())` or whatever the delegated scan method returns.
    fn scan_next(&mut self) -> Result<(), ValidatorError> {
        if self.finished {
            return Ok(());
        }

        if self.cursor >= self.bytes.len() {
            self.finished = true;
            return Ok(());
        }

        // A `<` that is the very last byte of the input. This is checked before the
        // text mode, so it also applies inside raw-text/RCDATA/plaintext elements.
        if self.bytes[self.cursor] == b'<' && self.cursor + 1 == self.bytes.len() {
            let start = self.cursor;
            let start_line = self.line;
            let start_col = self.col;
            self.emit_tokenizer_eof_after_lt(start, start_line, start_col);
            return Ok(());
        }

        // Elements with special content models bypass the markup dispatch below.
        match self.current_text_mode_kind() {
            TextModeKind::Data => {}
            TextModeKind::Plaintext => {
                // Everything up to end of input is one undecoded text event.
                let start = self.cursor;
                let start_line = self.line;
                let start_col = self.col;
                let end = self.bytes.len();
                self.bump_to(end);
                let text = bytes_to_string_lossy(&self.bytes[start..end]);
                self.pending.push_back(ParseEvent::Text {
                    text,
                    span: Some(self.current_span(start, end, start_line, start_col)),
                });
                self.finished = true;
                return Ok(());
            }
            TextModeKind::RawText => {
                // Raw text: character references are not decoded.
                return self.scan_rawtext(false);
            }
            TextModeKind::RcData => {
                // RCDATA: same scan, but character references are decoded.
                return self.scan_rawtext(true);
            }
        }

        let next_lt = memchr(b'<', &self.bytes[self.cursor..]).map(|off| self.cursor + off);
        if let Some(lt) = next_lt {
            // Text before the next `<`: emit it (with character references decoded
            // and any decoding errors queued first) and leave the `<` for the next call.
            if lt > self.cursor {
                let start = self.cursor;
                let start_line = self.line;
                let start_col = self.col;
                self.bump_to(lt);
                let raw = str_from_bytes_lossy(&self.bytes[start..lt]);
                let (text, errs) = decode_char_refs_with_errors(
                    self.format,
                    raw.as_ref(),
                    false,
                    start,
                    start_line,
                    start_col,
                );
                self.pending.extend(errs);
                self.pending.push_back(ParseEvent::Text {
                    text,
                    span: Some(self.current_span(start, lt, start_line, start_col)),
                });
                return Ok(());
            }

            // From here on the cursor sits exactly on a `<`.
            let start = self.cursor;
            let start_line = self.line;
            let start_col = self.col;

            // HTML only: `<>` is reported and then kept as literal text.
            if self.format == InputFormat::Html && self.bytes_at_cursor(b"<>") {
                self.bump_to(self.cursor + 2);
                self.pending.push_back(ParseEvent::ParseError {
                    code: "html.tokenizer.lt_gt".to_string(),
                    message:
                        "Saw “<>”. Probable causes: Unescaped “<” (escape as “<”) or mistyped start tag."
                            .to_string(),
                    span: Some(self.current_span(start, start + 2, start_line, start_col)),
                });
                self.pending.push_back(ParseEvent::Text {
                    text: "<>".to_string(),
                    span: Some(self.current_span(start, start + 2, start_line, start_col)),
                });
                return Ok(());
            }

            if self.bytes_at_cursor(b"<!--") {
                return self.scan_comment(start, start_line, start_col);
            }

            // CDATA is always recognised in XHTML; in HTML only in foreign (SVG/MathML)
            // content. In HTML content it falls through and ends up in the `<!`
            // bogus-comment branch further down.
            if self.bytes_at_cursor(b"<![CDATA[") {
                match self.format {
                    InputFormat::Xhtml => return self.scan_cdata(start, start_line, start_col),
                    InputFormat::Html => {
                        if self.current_insertion_namespace() != HtmlNamespace::Html {
                            return self.scan_cdata(start, start_line, start_col);
                        }
                    }
                }
            }

            // `<!` followed by optional ASCII whitespace and a case-insensitive `doctype`.
            if self.bytes_at_cursor(b"<!") {
                let mut j = self.cursor + 2;
                while j < self.bytes.len() && self.bytes[j].is_ascii_whitespace() {
                    j += 1;
                }
                if starts_with_ascii_case_insensitive(&self.bytes[j..], b"doctype") {
                    return self.scan_doctype(start, start_line, start_col);
                }
            }

            if self.bytes_at_cursor(b"</") {
                // `</` at end of input is reported with the same error as a lone `<`.
                let Some(b) = self.bytes.get(self.cursor + 2).copied() else {
                    self.emit_tokenizer_eof_after_lt(start, start_line, start_col);
                    return Ok(());
                };

                if b.is_ascii_alphabetic() {
                    return self.scan_end_tag(start, start_line, start_col);
                }
                // `</>` is reported and then kept as literal text (in both formats).
                if b == b'>' {
                    self.bump_to(self.cursor + 3);
                    self.pending.push_back(ParseEvent::ParseError {
                        code: "html.tokenizer.lt_slash_gt".to_string(),
                        message:
                            "Saw “</>”. Probable causes: Unescaped “<” (escape as “<”) or mistyped end tag."
                                .to_string(),
                        span: Some(self.current_span(start, start + 3, start_line, start_col)),
                    });
                    self.pending.push_back(ParseEvent::Text {
                        text: "</>".to_string(),
                        span: Some(self.current_span(start, start + 3, start_line, start_col)),
                    });
                    return Ok(());
                }
                if b.is_ascii_whitespace() {
                    return self.scan_garbage_after_lt_slash(start, start_line, start_col);
                }
                // Any other byte after `</`: treat the `<` as the start of a text run.
                return self.scan_text_run(start, start_line, start_col);
            }

            // `<?` is a processing instruction in XHTML and a bogus comment in HTML.
            if self.bytes_at_cursor(b"<?") {
                return match self.format {
                    InputFormat::Xhtml => {
                        self.scan_processing_instruction(start, start_line, start_col)
                    }
                    InputFormat::Html => self.scan_bogus_comment(start, start_line, start_col),
                };
            }
            // Any `<!` that was not a comment, CDATA section or doctype.
            if self.bytes_at_cursor(b"<!") {
                return self.scan_bogus_comment(start, start_line, start_col);
            }

            if let Some(&b) = self.bytes.get(self.cursor + 1) {
                if b.is_ascii_alphabetic() {
                    return self.scan_start_tag(start, start_line, start_col);
                }
                // HTML only: a `<` followed by something that cannot start a tag is
                // reported, then scanned as text. `!`, `/` and `?` were handled above.
                // NOTE(review): `b as char` converts one byte to U+0000..=U+00FF, so the
                // lead byte of a multi-byte UTF-8 character is shown as a Latin-1
                // character in the message, and the 2-byte span may split that character.
                if self.format == InputFormat::Html
                    && !b.is_ascii_whitespace()
                    && !matches!(b, b'!' | b'/' | b'?')
                {
                    self.pending.push_back(ParseEvent::ParseError {
                        code: "html.tokenizer.bad_char_after_lt".to_string(),
                        message:
                            format!("Bad character “{}” after “<”. Probable cause: Unescaped “<”. Try escaping it as “<”.", b as char),
                        span: Some(self.current_span(start, start + 2, start_line, start_col)),
                    });
                    return self.scan_text_run(start, start_line, start_col);
                }
            }

            // Fallback: the `<` does not start any recognised markup.
            return self.scan_text_run(start, start_line, start_col);
        }

        // No `<` left anywhere: the remainder of the input is a single text event.
        let start = self.cursor;
        let start_line = self.line;
        let start_col = self.col;
        let end = self.bytes.len();
        self.bump_to(end);
        let raw = str_from_bytes_lossy(&self.bytes[start..end]);
        let (text, errs) = decode_char_refs_with_errors(
            self.format,
            raw.as_ref(),
            false,
            start,
            start_line,
            start_col,
        );
        self.pending.extend(errs);
        self.pending.push_back(ParseEvent::Text {
            text,
            span: Some(self.current_span(start, end, start_line, start_col)),
        });
        Ok(())
    }
524
525 fn scan_comment(
526 &mut self,
527 start: usize,
528 start_line: u32,
529 start_col: u32,
530 ) -> Result<(), ValidatorError> {
531 let Some(end) = find_subslice(&self.bytes, self.cursor + 4, b"-->") else {
532 self.finished = true;
533 self.pending.push_back(ParseEvent::ParseError {
534 code: "html.tokenizer.eof_in_comment".to_string(),
535 message: "End of file inside comment.".to_string(),
536 span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
537 });
538 return Ok(());
539 };
540
541 let comment_start = self.cursor + 4;
542 let comment_end = end;
543 let text = bytes_to_string_lossy(&self.bytes[comment_start..comment_end]);
544 if self.format == InputFormat::Html
545 && let Some(off) = text.find("<!--")
546 {
547 let err_start = comment_start + off;
548 let (line, col) = line_col_at_byte_offset(
549 self.bytes.as_ref(),
550 start,
551 start_line,
552 start_col,
553 err_start,
554 );
555 self.pending.push_back(ParseEvent::ParseError {
556 code: "html.tokenizer.nested_comment".to_string(),
557 message:
558 "Saw “<!--” within a comment. Probable cause: Nested comment (not allowed)."
559 .to_string(),
560 span: Some(Span::new(err_start, err_start + 4, line, col)),
561 });
562 }
563 let close_end = end + 3;
564 self.bump_to(close_end);
565 self.pending.push_back(ParseEvent::Comment {
566 text,
567 span: Some(self.current_span(start, close_end, start_line, start_col)),
568 });
569 Ok(())
570 }
571
572 fn scan_cdata(
573 &mut self,
574 start: usize,
575 start_line: u32,
576 start_col: u32,
577 ) -> Result<(), ValidatorError> {
578 let cdata_start = self.cursor + 9;
580 let Some(end) = find_subslice(&self.bytes, cdata_start, b"]]>") else {
581 self.finished = true;
582 self.pending.push_back(ParseEvent::ParseError {
583 code: "xml.cdata_eof".to_string(),
584 message: "Unterminated CDATA section.".to_string(),
585 span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
586 });
587 return Ok(());
588 };
589
590 let text = bytes_to_string_lossy(&self.bytes[cdata_start..end]);
591 let close_end = end + 3;
592 self.bump_to(close_end);
593 self.pending.push_back(ParseEvent::Text {
594 text,
595 span: Some(self.current_span(start, close_end, start_line, start_col)),
596 });
597 Ok(())
598 }
599
600 fn scan_bogus_comment(
601 &mut self,
602 start: usize,
603 start_line: u32,
604 start_col: u32,
605 ) -> Result<(), ValidatorError> {
606 let is_processing_instruction = self.bytes_at_cursor(b"<?");
607 self.pending.push_back(ParseEvent::ParseError {
608 code: if is_processing_instruction {
609 "html.tokenizer.processing_instruction".to_string()
610 } else {
611 "html.tokenizer.bogus_comment".to_string()
612 },
613 message: if is_processing_instruction {
614 "Saw “<?”. Probable cause: Attempt to use an XML processing instruction in HTML. (XML processing instructions are not supported in HTML.)".to_string()
615 } else {
616 "Bogus comment.".to_string()
617 },
618 span: Some(self.current_span(start, start + 2, start_line, start_col)),
619 });
620 let prefix_len = 2;
622 if let Some(off) = memchr(b'>', &self.bytes[self.cursor + prefix_len..]) {
623 let gt = self.cursor + prefix_len + off;
624 let text = bytes_to_string_lossy(&self.bytes[self.cursor + prefix_len..gt]);
625 let end = gt + 1;
626 self.bump_to(end);
627 self.pending.push_back(ParseEvent::Comment {
628 text,
629 span: Some(self.current_span(start, end, start_line, start_col)),
630 });
631 return Ok(());
632 }
633 let text = bytes_to_string_lossy(&self.bytes[self.cursor + prefix_len..]);
635 self.bump_to(self.bytes.len());
636 self.pending.push_back(ParseEvent::Comment {
637 text,
638 span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
639 });
640 self.finished = true;
641 Ok(())
642 }
643
644 fn scan_processing_instruction(
645 &mut self,
646 start: usize,
647 start_line: u32,
648 start_col: u32,
649 ) -> Result<(), ValidatorError> {
650 let content_start = self.cursor + 2;
652 let Some(pi_end) = find_subslice(&self.bytes, content_start, b"?>") else {
653 self.finished = true;
654 self.pending.push_back(ParseEvent::ParseError {
655 code: "xml.pi_eof".to_string(),
656 message: "Unterminated processing instruction.".to_string(),
657 span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
658 });
659 return Ok(());
660 };
661
662 let mut i = content_start;
663 while i < pi_end && self.bytes[i].is_ascii_whitespace() {
664 i += 1;
665 }
666 let target_start = i;
667 while i < pi_end && !self.bytes[i].is_ascii_whitespace() {
668 i += 1;
669 }
670 let target = bytes_to_string_lossy(&self.bytes[target_start..i]);
671 let data = str_from_bytes_lossy(&self.bytes[i..pi_end])
672 .trim()
673 .to_string();
674
675 let close_end = pi_end + 2;
676 self.bump_to(close_end);
677 self.pending.push_back(ParseEvent::ProcessingInstruction {
678 target,
679 data,
680 span: Some(self.current_span(start, close_end, start_line, start_col)),
681 });
682 Ok(())
683 }
684
685 fn scan_end_tag(
686 &mut self,
687 start: usize,
688 start_line: u32,
689 start_col: u32,
690 ) -> Result<(), ValidatorError> {
691 let Some(off) = memchr(b'>', &self.bytes[self.cursor + 2..]) else {
692 self.finished = true;
693 self.pending.push_back(ParseEvent::ParseError {
694 code: "html.tokenizer.eof_in_end_tag".to_string(),
695 message: "End of file seen when looking for tag name. Ignoring tag.".to_string(),
696 span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
697 });
698 return Ok(());
699 };
700 let gt = self.cursor + 2 + off;
701
702 let raw_all = str_from_bytes_lossy(&self.bytes[self.cursor + 2..gt]);
703 let raw_trimmed = raw_all.trim();
704
705 let mut raw = raw_trimmed;
706 if raw.ends_with('/') {
707 self.pending.push_back(ParseEvent::ParseError {
708 code: "html.tokenizer.end_tag_stray_slash".to_string(),
709 message: "Stray “/” at the end of an end tag.".to_string(),
710 span: Some(self.current_span(start, gt + 1, start_line, start_col)),
711 });
712 raw = raw.trim_end_matches('/').trim_end();
713 }
714
715 let mut it = raw.split_whitespace();
716 let name_raw = it.next().unwrap_or("");
717 if it.next().is_some() {
718 self.pending.push_back(ParseEvent::ParseError {
719 code: "html.tokenizer.end_tag_with_attrs".to_string(),
720 message: "End tag had attributes.".to_string(),
721 span: Some(self.current_span(start, gt + 1, start_line, start_col)),
722 });
723 }
724
725 let name = self.normalize_name(name_raw);
726 let end = gt + 1;
727 self.bump_to(end);
728 self.pop_open_element(&name);
729 self.pending.push_back(ParseEvent::EndTag {
730 name,
731 span: Some(self.current_span(start, end, start_line, start_col)),
732 });
733 Ok(())
734 }
735
736 fn scan_start_tag(
737 &mut self,
738 start: usize,
739 start_line: u32,
740 start_col: u32,
741 ) -> Result<(), ValidatorError> {
742 let Some(gt) = find_tag_close(&self.bytes, self.cursor + 1) else {
743 let rest = &self.bytes[self.cursor + 1..];
744 let (code, message) = classify_start_tag_eof(rest);
745 self.finished = true;
746 self.pending.push_back(ParseEvent::ParseError {
747 code,
748 message,
749 span: Some(self.current_span(start, self.bytes.len(), start_line, start_col)),
750 });
751 self.bump_to(self.bytes.len());
752 return Ok(());
753 };
754
755 let inside = str_from_bytes_lossy(&self.bytes[self.cursor + 1..gt]);
756 let end = gt + 1;
757 let (name, attrs, self_closing, errs) =
758 parse_start_tag(self, inside.as_ref(), start, start_line, start_col, end)?;
759 self.pending.extend(errs);
760 self.bump_to(end);
761 let ns = self.namespace_for_start_tag(&name);
762 let pushes = match self.format {
763 InputFormat::Html => {
764 if ns == HtmlNamespace::Html {
765 !html_inspector::is_void_html_element(&name)
766 } else {
767 !self_closing
768 }
769 }
770 InputFormat::Xhtml => !self_closing,
771 };
772 if pushes {
773 self.open_elements.push(name.clone());
774 self.open_namespaces.push(ns);
775 }
776 self.pending.push_back(ParseEvent::StartTag {
777 name,
778 attrs,
779 self_closing,
780 span: Some(self.current_span(start, end, start_line, start_col)),
781 });
782 Ok(())
783 }
784
    /// Scans a `<!DOCTYPE …>` declaration whose `<!` starts at the cursor.
    ///
    /// Queues any doctype diagnostics first and then a `Doctype` event carrying the
    /// (normalized) name and the optional public and system identifiers. The
    /// declaration ends at the first `>` outside quotes, or at end of input, in which
    /// case scanning stops afterwards. As written, this function always returns `Ok(())`.
    ///
    /// The body walks the declaration with a single index `i`; the statements depend
    /// on each other's position updates, so their order matters.
    fn scan_doctype(
        &mut self,
        start: usize,
        start_line: u32,
        start_col: u32,
    ) -> Result<(), ValidatorError> {
        let all_bytes: &[u8] = &self.bytes;
        // Builds a span whose line/column are computed for `byte_start`, relative to
        // the known position of the doctype start.
        let mk_span = |byte_start: usize, byte_end: usize| {
            let (line, col) =
                line_col_at_byte_offset(all_bytes, start, start_line, start_col, byte_start);
            Span::new(byte_start, byte_end, line, col)
        };

        // Find the end of the declaration: one past the first `>` that is not inside a
        // single- or double-quoted string, or end of input if there is none.
        let end = {
            let mut pos = self.cursor + 2;
            let mut quote: Option<u8> = None;
            loop {
                let Some(&b) = self.bytes.get(pos) else {
                    break self.bytes.len();
                };
                if let Some(q) = quote {
                    if b == q {
                        quote = None;
                    }
                } else if matches!(b, b'"' | b'\'') {
                    quote = Some(b);
                } else if b == b'>' {
                    break pos + 1;
                }
                pos += 1;
            }
        };

        let bytes = all_bytes;
        // `i` is the parse position; it starts right after `<!`.
        let mut i = self.cursor + 2;

        // Skip whitespace between `<!` and the `doctype` keyword, then the keyword itself.
        while i < end && bytes[i].is_ascii_whitespace() {
            i += 1;
        }
        if starts_with_ascii_case_insensitive(&bytes[i..end], b"doctype") {
            i += "doctype".len();
        }

        // The keyword must be followed by whitespace.
        if i >= end || !bytes[i].is_ascii_whitespace() {
            self.pending.push_back(ParseEvent::ParseError {
                code: "html.tokenizer.doctype.missing_space_before_name".to_string(),
                message: "Missing space before doctype name.".to_string(),
                span: Some(mk_span(start, start + 2)),
            });
        }

        while i < end && bytes[i].is_ascii_whitespace() {
            i += 1;
        }

        // The name runs up to the next whitespace or `>`; it is `None` when empty.
        let name_start = i;
        while i < end && !bytes[i].is_ascii_whitespace() && bytes[i] != b'>' {
            i += 1;
        }
        let name = if name_start < i {
            Some(self.normalize_name(str_from_bytes_lossy(&bytes[name_start..i])))
        } else {
            None
        };

        while i < end && bytes[i].is_ascii_whitespace() {
            i += 1;
        }

        let mut public_id: Option<String> = None;
        let mut system_id: Option<String> = None;
        // Either flag suppresses the "not HTML5" diagnostic further down.
        let mut saw_syntax_error = false;
        let mut saw_bogus_doctype = false;

        // Anything after the name must be a PUBLIC or SYSTEM clause.
        if i < end && bytes[i] != b'>' {
            if starts_with_ascii_case_insensitive(&bytes[i..end], b"public") {
                i += "public".len();
                // A quote directly after the keyword: report it, but keep parsing.
                if i < end && matches!(bytes[i], b'"' | b'\'') {
                    saw_syntax_error = true;
                    self.pending.push_back(ParseEvent::ParseError {
                        code: "html.tokenizer.doctype.no_space_after_public".to_string(),
                        message: "No space between the doctype “PUBLIC” keyword and the quote."
                            .to_string(),
                        span: Some(mk_span(start, start + 2)),
                    });
                }
                while i < end && bytes[i].is_ascii_whitespace() {
                    i += 1;
                }
                if i >= end || bytes[i] == b'>' {
                    saw_syntax_error = true;
                    self.pending.push_back(ParseEvent::ParseError {
                        code: "html.tokenizer.doctype.expected_public_id".to_string(),
                        message: "Expected a public identifier but the doctype ended.".to_string(),
                        span: Some(mk_span(start, start + 2)),
                    });
                } else if matches!(bytes[i], b'"' | b'\'') {
                    // Quoted public identifier: read up to the matching quote.
                    let q = bytes[i];
                    i += 1;
                    let id_start = i;
                    // Only the first `>` inside the identifier is reported.
                    let mut saw_gt = false;
                    while i < end && bytes[i] != q {
                        if bytes[i] == b'>' && !saw_gt {
                            saw_gt = true;
                            saw_syntax_error = true;
                            self.pending.push_back(ParseEvent::ParseError {
                                code: "html.tokenizer.doctype.gt_in_public_id".to_string(),
                                message: "“>” in public identifier.".to_string(),
                                span: Some(mk_span(i, i + 1)),
                            });
                        }
                        i += 1;
                    }
                    // Reaching `end` without the closing quote means the input ended.
                    if i >= end {
                        saw_syntax_error = true;
                        self.pending.push_back(ParseEvent::ParseError {
                            code: "html.tokenizer.doctype.eof_in_public_id".to_string(),
                            message: "End of file inside public identifier.".to_string(),
                            span: Some(mk_span(start, end)),
                        });
                    }
                    public_id = Some(bytes_to_string_lossy(&bytes[id_start..i.min(end)]));
                    if i < end && bytes[i] == q {
                        i += 1;
                    }

                    // An optional quoted system identifier may follow the public one.
                    let mut had_ws = false;
                    while i < end && bytes[i].is_ascii_whitespace() {
                        had_ws = true;
                        i += 1;
                    }
                    if i < end && (bytes[i] == b'"' || bytes[i] == b'\'') {
                        if !had_ws {
                            saw_syntax_error = true;
                            self.pending.push_back(ParseEvent::ParseError {
                                code: "html.tokenizer.doctype.no_space_between_public_system"
                                    .to_string(),
                                message:
                                    "No space between the doctype public and system identifiers."
                                        .to_string(),
                                span: Some(mk_span(i, i + 1)),
                            });
                        }
                        let q = bytes[i];
                        i += 1;
                        let id_start = i;
                        let mut saw_gt = false;
                        while i < end && bytes[i] != q {
                            if bytes[i] == b'>' && !saw_gt {
                                saw_gt = true;
                                saw_syntax_error = true;
                                self.pending.push_back(ParseEvent::ParseError {
                                    code: "html.tokenizer.doctype.gt_in_system_id".to_string(),
                                    message: "“>” in system identifier.".to_string(),
                                    span: Some(mk_span(i, i + 1)),
                                });
                            }
                            i += 1;
                        }
                        if i >= end {
                            saw_syntax_error = true;
                            self.pending.push_back(ParseEvent::ParseError {
                                code: "html.tokenizer.doctype.eof_in_system_id".to_string(),
                                message: "End of file inside system identifier.".to_string(),
                                span: Some(mk_span(start, end)),
                            });
                        }
                        system_id = Some(bytes_to_string_lossy(&bytes[id_start..i.min(end)]));
                    }
                    // Anything else after the public identifier is not inspected.
                }
            } else if starts_with_ascii_case_insensitive(&bytes[i..end], b"system") {
                i += "system".len();
                // Mirrors the PUBLIC branch above for a standalone system identifier.
                if i < end && (bytes[i] == b'"' || bytes[i] == b'\'') {
                    saw_syntax_error = true;
                    self.pending.push_back(ParseEvent::ParseError {
                        code: "html.tokenizer.doctype.no_space_after_system".to_string(),
                        message: "No space between the doctype “SYSTEM” keyword and the quote."
                            .to_string(),
                        span: Some(mk_span(start, start + 2)),
                    });
                }
                while i < end && bytes[i].is_ascii_whitespace() {
                    i += 1;
                }
                if i >= end || bytes[i] == b'>' {
                    saw_syntax_error = true;
                    self.pending.push_back(ParseEvent::ParseError {
                        code: "html.tokenizer.doctype.expected_system_id".to_string(),
                        message: "Expected a system identifier but the doctype ended.".to_string(),
                        span: Some(mk_span(start, start + 2)),
                    });
                } else if bytes[i] == b'"' || bytes[i] == b'\'' {
                    let q = bytes[i];
                    i += 1;
                    let id_start = i;
                    let mut saw_gt = false;
                    while i < end && bytes[i] != q {
                        if bytes[i] == b'>' && !saw_gt {
                            saw_gt = true;
                            saw_syntax_error = true;
                            self.pending.push_back(ParseEvent::ParseError {
                                code: "html.tokenizer.doctype.gt_in_system_id".to_string(),
                                message: "“>” in system identifier.".to_string(),
                                span: Some(mk_span(i, i + 1)),
                            });
                        }
                        i += 1;
                    }
                    if i >= end {
                        saw_syntax_error = true;
                        self.pending.push_back(ParseEvent::ParseError {
                            code: "html.tokenizer.doctype.eof_in_system_id".to_string(),
                            message: "End of file inside system identifier.".to_string(),
                            span: Some(mk_span(start, end)),
                        });
                    }
                    system_id = Some(bytes_to_string_lossy(&bytes[id_start..i.min(end)]));
                }
                // An unquoted token after SYSTEM is not inspected.
            } else {
                // Neither PUBLIC nor SYSTEM follows the name.
                saw_bogus_doctype = true;
                self.pending.push_back(ParseEvent::ParseError {
                    code: "html.tokenizer.doctype.bogus".to_string(),
                    message: "Bogus doctype.".to_string(),
                    span: Some(mk_span(start, start + 2)),
                });
            }
        }

        // A syntactically clean doctype that is not exactly `<!DOCTYPE html>` (wrong
        // name, or any identifier present) gets a "not HTML5" diagnostic. The message
        // differs only for the HTML 4.01 Transitional public/system identifier pair.
        if !saw_syntax_error
            && !saw_bogus_doctype
            && let Some(n) = name.as_deref()
        {
            let is_html = n.eq_ignore_ascii_case("html");
            if !is_html || public_id.is_some() || system_id.is_some() {
                let transitional_public =
                    public_id.as_deref() == Some("-//W3C//DTD HTML 4.01 Transitional//EN");
                let transitional_system =
                    system_id.as_deref() == Some("http://www.w3.org/TR/html4/loose.dtd");
                let msg = if is_html && transitional_public && transitional_system {
                    "Almost standards mode doctype. Expected “<!DOCTYPE html>”."
                } else {
                    "Obsolete doctype. Expected “<!DOCTYPE html>”."
                };
                self.pending.push_back(ParseEvent::ParseError {
                    code: "html.parser.doctype.not_html5".to_string(),
                    message: msg.to_string(),
                    span: Some(mk_span(start, start + 2)),
                });
            }
        }

        self.bump_to(end);
        // The declaration consumed the rest of the input: nothing is left to scan.
        if end == self.bytes.len() {
            self.finished = true;
        }
        self.pending.push_back(ParseEvent::Doctype {
            name,
            public_id,
            system_id,
            span: Some(self.current_span(start, end, start_line, start_col)),
        });
        Ok(())
    }
1056
1057 fn scan_rawtext(&mut self, decode: bool) -> Result<(), ValidatorError> {
1058 let start = self.cursor;
1059 let start_line = self.line;
1060 let start_col = self.col;
1061
1062 let end_tag = self.open_elements.last().map_or("", String::as_str);
1063 let lt = find_rawtext_end_tag(&self.bytes, self.cursor, end_tag, self.format);
1064 if let Some(lt) = lt {
1065 if lt > self.cursor {
1066 self.bump_to(lt);
1067 let raw = bytes_to_string_lossy(&self.bytes[start..lt]);
1068 let text = if decode {
1069 decode_char_refs(self.format, raw, false)
1070 } else {
1071 raw
1072 };
1073 self.pending.push_back(ParseEvent::Text {
1074 text,
1075 span: Some(self.current_span(start, lt, start_line, start_col)),
1076 });
1077 return Ok(());
1078 }
1079 return self.scan_end_tag(start, start_line, start_col);
1081 }
1082
1083 let end = self.bytes.len();
1085 self.bump_to(end);
1086 let raw = bytes_to_string_lossy(&self.bytes[start..end]);
1087 let text = if decode {
1088 decode_char_refs(self.format, raw, false)
1089 } else {
1090 raw
1091 };
1092 self.pending.push_back(ParseEvent::Text {
1093 text,
1094 span: Some(self.current_span(start, end, start_line, start_col)),
1095 });
1096 self.finished = true;
1097 Ok(())
1098 }
1099
1100 fn pop_open_element(&mut self, name: &str) {
1101 let Some(pos) = (match self.format {
1102 InputFormat::Html => self
1103 .open_elements
1104 .iter()
1105 .rposition(|n| n.eq_ignore_ascii_case(name)),
1106 InputFormat::Xhtml => self.open_elements.iter().rposition(|n| n == name),
1107 }) else {
1108 return;
1109 };
1110 self.open_elements.truncate(pos);
1111 self.open_namespaces.truncate(pos);
1112 }
1113
1114 fn scan_text_run(
1115 &mut self,
1116 start: usize,
1117 start_line: u32,
1118 start_col: u32,
1119 ) -> Result<(), ValidatorError> {
1120 let next_lt = memchr(b'<', &self.bytes[self.cursor + 1..])
1123 .map_or(self.bytes.len(), |off| self.cursor + 1 + off);
1124 self.bump_to(next_lt);
1125 let raw = bytes_to_string_lossy(&self.bytes[start..next_lt]);
1126 let text = decode_char_refs(self.format, raw, false);
1127 self.pending.push_back(ParseEvent::Text {
1128 text,
1129 span: Some(self.current_span(start, next_lt, start_line, start_col)),
1130 });
1131 Ok(())
1132 }
1133
1134 fn scan_garbage_after_lt_slash(
1135 &mut self,
1136 start: usize,
1137 start_line: u32,
1138 start_col: u32,
1139 ) -> Result<(), ValidatorError> {
1140 self.pending.push_back(ParseEvent::ParseError {
1142 code: "html.tokenizer.garbage_after_lt_slash".to_string(),
1143 message: "Garbage after “</”.".to_string(),
1144 span: Some(self.current_span(start, start + 2, start_line, start_col)),
1145 });
1146 let Some(off) = memchr(b'>', &self.bytes[self.cursor + 2..]) else {
1147 self.finished = true;
1148 self.bump_to(self.bytes.len());
1149 return Ok(());
1150 };
1151 let gt = self.cursor + 2 + off;
1152
1153 let text = bytes_to_string_lossy(&self.bytes[self.cursor + 2..gt]);
1154 let end = gt + 1;
1155 self.bump_to(end);
1156 self.pending.push_back(ParseEvent::Comment {
1157 text,
1158 span: Some(self.current_span(start, end, start_line, start_col)),
1159 });
1160 Ok(())
1161 }
1162}
1163
1164impl EventSource for SimpleHtmlEventSource {
1165 fn source_name(&self) -> &str {
1166 &self.name
1167 }
1168
1169 fn format(&self) -> InputFormat {
1170 self.format
1171 }
1172
1173 fn next_event(&mut self) -> Result<Option<ParseEvent>, ValidatorError> {
1174 if self.pending.is_empty() && !self.finished {
1175 self.scan_next()?;
1176 }
1177 Ok(self.pending.pop_front())
1178 }
1179}
1180
/// The kind of text mode the scanner can be in.
///
/// NOTE(review): the variant names appear to mirror the HTML tokenizer's
/// data / RAWTEXT / RCDATA / PLAINTEXT content models; the code that selects
/// and consumes these values is outside this section, so confirm there.
#[derive(Clone, Copy, Debug)]
enum TextModeKind {
    /// Presumably ordinary markup-bearing content; confirm against usage.
    Data,
    /// Presumably raw text without character-reference decoding; confirm against usage.
    RawText,
    /// Presumably raw text with character-reference decoding; confirm against usage.
    RcData,
    /// Presumably text that runs to the end of input; confirm against usage.
    Plaintext,
}
1188
/// Views `bytes` as text, replacing invalid UTF-8 sequences with U+FFFD.
///
/// Valid UTF-8 is borrowed as-is; an owned string is produced only when a
/// replacement was necessary.
fn str_from_bytes_lossy(bytes: &[u8]) -> Cow<'_, str> {
    match std::str::from_utf8(bytes) {
        Ok(valid) => Cow::Borrowed(valid),
        Err(_) => String::from_utf8_lossy(bytes),
    }
}
1192
1193fn bytes_to_string_lossy(bytes: &[u8]) -> String {
1194 str_from_bytes_lossy(bytes).into_owned()
1195}
1196
/// Computes the line/column reached after walking `bytes` from `base_start`
/// up to (but excluding) `target`.
///
/// `base_line`/`base_col` are the position at `base_start`. Every `\n`
/// increments the line and resets the column to 1; every other byte
/// increments the column by one (columns therefore count bytes, not
/// characters). `target` is clamped to the end of `bytes`; when there is
/// nothing to walk (`target <= base_start`, or `base_start` at/after the end
/// of `bytes`) the base position is returned unchanged.
fn line_col_at_byte_offset(
    bytes: &[u8],
    base_start: usize,
    base_line: u32,
    base_col: u32,
    target: usize,
) -> (u32, u32) {
    let stop = target.min(bytes.len());
    // `get` yields `None` for an inverted range, which is the "nothing to walk" case.
    let walked = bytes.get(base_start..stop).unwrap_or(&[]);

    walked
        .iter()
        .fold((base_line, base_col), |(line, col), &byte| {
            if byte == b'\n' {
                (line + 1, 1)
            } else {
                (line, col + 1)
            }
        })
}
1223
1224fn parse_start_tag(
1225 src: &SimpleHtmlEventSource,
1226 inside: &str,
1227 tag_start: usize,
1228 tag_line: u32,
1229 tag_col: u32,
1230 tag_end: usize,
1231) -> Result<(String, Vec<Attribute>, bool, Vec<ParseEvent>), ValidatorError> {
1232 #[inline]
1233 fn skip_ws(bytes: &[u8], i: &mut usize) {
1234 while *i < bytes.len() && bytes[*i].is_ascii_whitespace() {
1235 *i += 1;
1236 }
1237 }
1238
1239 #[inline]
1240 fn push_parse_error(
1241 errs: &mut Vec<ParseEvent>,
1242 span: Option<Span>,
1243 code: &'static str,
1244 message: impl Into<String>,
1245 ) {
1246 errs.push(ParseEvent::ParseError {
1247 code: code.to_string(),
1248 message: message.into(),
1249 span,
1250 });
1251 }
1252
1253 let bytes = inside.as_bytes();
1254 let mut i = 0usize;
1255
1256 let inside_base_start = tag_start.saturating_add(1);
1257 let has_gt = tag_end > 0
1258 && tag_end <= src.bytes.len()
1259 && src.bytes.get(tag_end - 1).copied() == Some(b'>');
1260 let inside_base_end = if has_gt {
1261 tag_end.saturating_sub(1)
1262 } else {
1263 tag_end
1264 };
1265 let can_map_to_source_bytes = inside_base_end <= src.bytes.len()
1266 && inside_base_end.saturating_sub(inside_base_start) == bytes.len();
1267
1268 skip_ws(bytes, &mut i);
1270 let name_start = i;
1271 while i < bytes.len() && is_tag_name_char(bytes[i]) {
1272 i += 1;
1273 }
1274 let name_raw = &inside[name_start..i];
1275 let name = src.normalize_name(name_raw);
1276
1277 let mut attrs: Vec<Attribute> = Vec::new();
1278 let mut errs: Vec<ParseEvent> = Vec::new();
1279 let mut self_closing = false;
1280 let tag_span = Some(Span::new(tag_start, tag_end, tag_line, tag_col));
1281
1282 while i < bytes.len() {
1283 skip_ws(bytes, &mut i);
1284 if i >= bytes.len() {
1285 break;
1286 }
1287 if bytes[i] == b'/' {
1288 let mut j = i + 1;
1290 skip_ws(bytes, &mut j);
1291 if j >= bytes.len() {
1292 self_closing = true;
1293 break;
1294 }
1295 }
1296
1297 if bytes[i] == b'=' {
1299 push_parse_error(
1300 &mut errs,
1301 tag_span,
1302 "html.tokenizer.equals_expecting_attr_name",
1303 "Saw “=” when expecting an attribute name. Probable cause: Attribute name missing.",
1304 );
1305 i += 1;
1306 continue;
1307 }
1308 if bytes[i] == b'<' {
1309 push_parse_error(
1310 &mut errs,
1311 tag_span,
1312 "html.tokenizer.lt_expecting_attr_name",
1313 "Saw “<” when expecting an attribute name. Probable cause: Missing “>” immediately before.",
1314 );
1315 i += 1;
1316 continue;
1317 }
1318
1319 let attr_name_start = i;
1320 while i < bytes.len() && !bytes[i].is_ascii_whitespace() && bytes[i] != b'=' {
1321 if bytes[i] == b'/' {
1322 break;
1323 }
1324 i += 1;
1325 }
1326 if i == attr_name_start {
1327 break;
1329 }
1330 let attr_name_raw = &inside[attr_name_start..i];
1331 let attr_name = src.normalize_name(attr_name_raw);
1332 if attr_name_raw.contains('"') {
1333 push_parse_error(
1334 &mut errs,
1335 tag_span,
1336 "html.tokenizer.quote_in_attr_name",
1337 "Quote “\"” in attribute name. Probable cause: Matching quote missing somewhere earlier.",
1338 );
1339 }
1340 if attr_name_raw.contains('<') {
1341 push_parse_error(
1342 &mut errs,
1343 tag_span,
1344 "html.tokenizer.lt_in_attr_name",
1345 "“<” in attribute name. Probable cause: “>” missing immediately before.",
1346 );
1347 }
1348 if attrs.iter().any(|a| a.name == attr_name) {
1349 push_parse_error(
1350 &mut errs,
1351 tag_span,
1352 "html.tokenizer.duplicate_attribute",
1353 format!("Duplicate attribute “{attr_name}”."),
1354 );
1355 }
1356
1357 skip_ws(bytes, &mut i);
1358
1359 let mut value: Option<String> = None;
1360 if i < bytes.len() && bytes[i] == b'=' {
1361 i += 1;
1362 skip_ws(bytes, &mut i);
1363 if i >= bytes.len() || bytes[i] == b'>' {
1364 push_parse_error(
1365 &mut errs,
1366 tag_span,
1367 "html.tokenizer.attr_value_missing",
1368 "Attribute value missing.",
1369 );
1370 }
1371 if i < bytes.len() && (bytes[i] == b'"' || bytes[i] == b'\'') {
1372 let quote = bytes[i];
1373 i += 1;
1374 let value_start = i;
1375 while i < bytes.len() && bytes[i] != quote {
1376 i += 1;
1377 }
1378 let raw = &inside[value_start..i];
1379 let (base_start, base_line, base_col) = if can_map_to_source_bytes {
1380 let base_start = inside_base_start + value_start;
1381 let (base_line, base_col) = line_col_at_byte_offset(
1382 src.bytes.as_ref(),
1383 tag_start,
1384 tag_line,
1385 tag_col,
1386 base_start,
1387 );
1388 (base_start, base_line, base_col)
1389 } else {
1390 (tag_start, tag_line, tag_col)
1391 };
1392 let (decoded, decoded_errs) = decode_char_refs_with_errors(
1393 src.format, raw, true, base_start, base_line, base_col,
1394 );
1395 errs.extend(decoded_errs);
1396 value = Some(decoded);
1397 if i < bytes.len() && bytes[i] == quote {
1398 i += 1;
1399 }
1400 if i < bytes.len() && bytes[i].is_ascii_alphabetic() {
1402 push_parse_error(
1403 &mut errs,
1404 tag_span,
1405 "html.tokenizer.no_space_between_attrs",
1406 "No space between attributes.",
1407 );
1408 }
1409 } else {
1410 let value_start = i;
1411 while i < bytes.len() && !bytes[i].is_ascii_whitespace() {
1412 i += 1;
1413 }
1414 let raw = &inside[value_start..i];
1415 if raw.starts_with('`') {
1416 push_parse_error(
1417 &mut errs,
1418 tag_span,
1419 "html.tokenizer.backtick_at_start_unquoted",
1420 "“`” at the start of an unquoted attribute value. Probable cause: Using the wrong character as a quote.",
1421 );
1422 } else if raw.contains('`') {
1423 push_parse_error(
1424 &mut errs,
1425 tag_span,
1426 "html.tokenizer.backtick_in_unquoted",
1427 "“`” in an unquoted attribute value. Probable cause: Using the wrong character as a quote.",
1428 );
1429 }
1430 if raw.starts_with('<') {
1431 push_parse_error(
1432 &mut errs,
1433 tag_span,
1434 "html.tokenizer.lt_at_start_unquoted",
1435 "“<” at the start of an unquoted attribute value. Probable cause: Missing “>” immediately before.",
1436 );
1437 } else if raw.contains('<') {
1438 push_parse_error(
1439 &mut errs,
1440 tag_span,
1441 "html.tokenizer.lt_in_unquoted",
1442 "“<” in an unquoted attribute value. Probable cause: Missing “>” immediately before.",
1443 );
1444 }
1445 if raw.starts_with('=') {
1446 push_parse_error(
1447 &mut errs,
1448 tag_span,
1449 "html.tokenizer.equals_at_start_unquoted",
1450 "“=” at the start of an unquoted attribute value. Probable cause: Stray duplicate equals sign.",
1451 );
1452 }
1453 if raw.contains('"') {
1454 push_parse_error(
1455 &mut errs,
1456 tag_span,
1457 "html.tokenizer.quote_in_unquoted",
1458 "“\"” in an unquoted attribute value. Probable causes: Attributes running together or a URL query string in an unquoted attribute value.",
1459 );
1460 }
1461 let (base_start, base_line, base_col) = if can_map_to_source_bytes {
1462 let base_start = inside_base_start + value_start;
1463 let (base_line, base_col) = line_col_at_byte_offset(
1464 src.bytes.as_ref(),
1465 tag_start,
1466 tag_line,
1467 tag_col,
1468 base_start,
1469 );
1470 (base_start, base_line, base_col)
1471 } else {
1472 (tag_start, tag_line, tag_col)
1473 };
1474 let (decoded, decoded_errs) = decode_char_refs_with_errors(
1475 src.format, raw, true, base_start, base_line, base_col,
1476 );
1477 errs.extend(decoded_errs);
1478 value = Some(decoded);
1479 }
1480 }
1481
1482 let span = if can_map_to_source_bytes {
1483 let base_start = inside_base_start + attr_name_start;
1484 let base_end = inside_base_start + i;
1485 let (line, col) = line_col_at_byte_offset(
1486 src.bytes.as_ref(),
1487 tag_start,
1488 tag_line,
1489 tag_col,
1490 base_start,
1491 );
1492 Some(Span::new(base_start, base_end, line, col))
1493 } else {
1494 None
1495 };
1496 attrs.push(Attribute {
1497 name: attr_name,
1498 value,
1499 span,
1500 });
1501 }
1502
1503 if src.format == InputFormat::Html {
1505 let trimmed = inside.trim_end();
1506 if trimmed.ends_with('/') && trimmed.len() != inside.len() {
1507 push_parse_error(
1508 &mut errs,
1509 tag_span,
1510 "html.tokenizer.slash_not_immediately_followed_by_gt",
1511 "A slash was not immediately followed by “>”.",
1512 );
1513 }
1514 }
1515
1516 if src.format == InputFormat::Html && name.eq_ignore_ascii_case("image") {
1518 push_parse_error(
1519 &mut errs,
1520 tag_span,
1521 "html.tokenizer.image_start_tag",
1522 "Saw a start tag “image”.",
1523 );
1524 }
1525
1526 Ok((name, attrs, self_closing, errs))
1527}
1528
1529fn decode_char_refs(format: InputFormat, s: String, in_attribute: bool) -> String {
1530 if !s.contains('&') {
1531 return s;
1532 }
1533
1534 let bytes = s.as_bytes();
1535 let mut out = String::with_capacity(s.len());
1536 let mut i = 0usize;
1537 let mut last = 0usize;
1538 while i < bytes.len() {
1539 if bytes[i] != b'&' {
1540 i += 1;
1541 continue;
1542 }
1543
1544 out.push_str(&s[last..i]);
1545 let start = i;
1546 i += 1;
1547 if i >= bytes.len() {
1548 out.push('&');
1549 last = i;
1550 break;
1551 }
1552
1553 if bytes[i] == b'#' {
1554 let mut j = i + 1;
1555 let is_hex = j < bytes.len() && matches!(bytes[j], b'x' | b'X');
1556 if is_hex {
1557 j += 1;
1558 }
1559 let digits_start = j;
1560 if is_hex {
1561 while j < bytes.len() && bytes[j].is_ascii_hexdigit() {
1562 j += 1;
1563 }
1564 } else {
1565 while j < bytes.len() && bytes[j].is_ascii_digit() {
1566 j += 1;
1567 }
1568 }
1569 if digits_start == j {
1570 out.push('&');
1571 i = start + 1;
1572 last = i;
1573 continue;
1574 }
1575 let digits = &s[digits_start..j];
1576 let radix = if is_hex { 16 } else { 10 };
1577 let value = u32::from_str_radix(digits, radix).ok();
1578 if j < bytes.len() && bytes[j] == b';' {
1579 j += 1;
1580 }
1581 if let Some(cp) = value.and_then(valid_code_point) {
1582 out.push(cp);
1583 } else {
1584 out.push('\u{FFFD}');
1585 }
1586 i = j;
1587 last = i;
1588 continue;
1589 }
1590
1591 let mut j = i;
1593 let mut best: Option<(usize, &'static str)> = None;
1594 while j < bytes.len() {
1595 let b = bytes[j];
1596 if !(b.is_ascii_alphanumeric() || b == b';') {
1597 break;
1598 }
1599 j += 1;
1600 let cand = &s[i..j];
1601 if let Some(val) = resolve_named_ref(format, cand) {
1602 best = Some((j, val));
1603 }
1604 if b == b';' {
1605 break;
1606 }
1607 }
1608
1609 if let Some((end, val)) = best {
1610 let matched = &s[i..end];
1611 if in_attribute && !matched.ends_with(';') {
1612 let next = bytes.get(end).copied().unwrap_or(b' ');
1613 if next.is_ascii_alphanumeric() || next == b'=' {
1614 out.push('&');
1615 i = start + 1;
1616 last = i;
1617 continue;
1618 }
1619 }
1620 out.push_str(val);
1621 i = end;
1622 last = i;
1623 continue;
1624 }
1625
1626 out.push('&');
1628 i = start + 1;
1629 last = i;
1630 }
1631 out.push_str(&s[last..]);
1632 out
1633}
1634
1635fn decode_char_refs_with_errors(
1636 format: InputFormat,
1637 s: &str,
1638 in_attribute: bool,
1639 base_start: usize,
1640 base_line: u32,
1641 base_col: u32,
1642) -> (String, Vec<ParseEvent>) {
1643 if format != InputFormat::Html {
1644 return (s.to_string(), Vec::new());
1645 }
1646 let mut errs: Vec<ParseEvent> = Vec::new();
1647 let line_col_at =
1648 |byte_off: usize| line_col_at_byte_offset(s.as_bytes(), 0, base_line, base_col, byte_off);
1649 if let Some((byte_off, cp, byte_len)) = first_forbidden_code_point(s) {
1650 let (line, col) = line_col_at(byte_off);
1651 errs.push(ParseEvent::ParseError {
1652 code: "html.tokenizer.forbidden_code_point".to_string(),
1653 message: format!("Forbidden code point U+{:04x}.", cp),
1654 span: Some(Span::new(
1655 base_start + byte_off,
1656 base_start + byte_off + byte_len,
1657 line,
1658 col,
1659 )),
1660 });
1661 }
1662 if let Some((byte_off, byte_len)) = first_astral_noncharacter(s) {
1663 let (line, col) = line_col_at(byte_off);
1664 errs.push(ParseEvent::ParseError {
1665 code: "html.tokenizer.astral_noncharacter".to_string(),
1666 message: "Astral non-character.".to_string(),
1667 span: Some(Span::new(
1668 base_start + byte_off,
1669 base_start + byte_off + byte_len,
1670 line,
1671 col,
1672 )),
1673 });
1674 }
1675
1676 if !s.contains('&') {
1677 return (s.to_string(), errs);
1678 }
1679
1680 let bytes = s.as_bytes();
1681 let mut out = String::with_capacity(s.len());
1682 let mut i = 0usize;
1683 let mut last = 0usize;
1684 while i < bytes.len() {
1685 if bytes[i] != b'&' {
1686 i += 1;
1687 continue;
1688 }
1689
1690 out.push_str(&s[last..i]);
1691 let amp_off = i;
1692 i += 1;
1693 if i >= bytes.len() {
1694 out.push('&');
1695 last = i;
1696 break;
1697 }
1698
1699 if bytes[i] == b'#' {
1700 let mut j = i + 1;
1701 let is_hex = j < bytes.len() && matches!(bytes[j], b'x' | b'X');
1702 if is_hex {
1703 j += 1;
1704 }
1705 let digits_start = j;
1706 if is_hex {
1707 while j < bytes.len() && bytes[j].is_ascii_hexdigit() {
1708 j += 1;
1709 }
1710 } else {
1711 while j < bytes.len() && bytes[j].is_ascii_digit() {
1712 j += 1;
1713 }
1714 }
1715 if digits_start == j {
1716 let (line, col) = line_col_at(amp_off);
1717 errs.push(ParseEvent::ParseError {
1718 code: "html.tokenizer.charref_no_digits".to_string(),
1719 message: "No digits after “”.".to_string(),
1720 span: Some(Span::new(
1721 base_start + amp_off,
1722 base_start + amp_off + 1,
1723 line,
1724 col,
1725 )),
1726 });
1727 out.push('&');
1728 i = amp_off + 1;
1729 last = i;
1730 continue;
1731 }
1732 let digits = &s[digits_start..j];
1733 let radix = if is_hex { 16 } else { 10 };
1734 let value = u32::from_str_radix(digits, radix).unwrap_or(0);
1735 let had_semicolon = j < bytes.len() && bytes[j] == b';';
1736 if had_semicolon {
1737 j += 1;
1738 } else {
1739 let (line, col) = line_col_at(amp_off);
1740 errs.push(ParseEvent::ParseError {
1741 code: "html.tokenizer.charref_no_semicolon".to_string(),
1742 message: "Character reference was not terminated by a semicolon.".to_string(),
1743 span: Some(Span::new(
1744 base_start + amp_off,
1745 base_start + amp_off + 1,
1746 line,
1747 col,
1748 )),
1749 });
1750 }
1751
1752 let msg = classify_numeric_charref(value);
1753 if let Some((code, message)) = msg {
1754 let (line, col) = line_col_at(amp_off);
1755 errs.push(ParseEvent::ParseError {
1756 code: code.to_string(),
1757 message,
1758 span: Some(Span::new(
1759 base_start + amp_off,
1760 base_start + amp_off + 1,
1761 line,
1762 col,
1763 )),
1764 });
1765 }
1766
1767 if let Some(cp) = valid_code_point(value) {
1768 out.push(cp);
1769 } else {
1770 out.push('\u{FFFD}');
1771 }
1772 i = j;
1773 last = i;
1774 continue;
1775 }
1776
1777 let mut j = i;
1779 let mut best: Option<(usize, &'static str)> = None;
1780 while j < bytes.len() {
1781 let b = bytes[j];
1782 if !(b.is_ascii_alphanumeric() || b == b';') {
1783 break;
1784 }
1785 j += 1;
1786 let cand = &s[i..j];
1787 if let Some(val) = resolve_named_ref(format, cand) {
1788 best = Some((j, val));
1789 }
1790 if b == b';' {
1791 break;
1792 }
1793 }
1794
1795 if let Some((end, val)) = best {
1796 let matched = &s[i..end];
1797 if in_attribute && !matched.ends_with(';') {
1798 let next = bytes.get(end).copied().unwrap_or(b' ');
1799 if next.is_ascii_alphanumeric() || next == b'=' {
1800 out.push('&');
1801 i = amp_off + 1;
1802 last = i;
1803 continue;
1804 }
1805 }
1806 if !matched.ends_with(';') {
1807 let (line, col) = line_col_at(amp_off);
1808 errs.push(ParseEvent::ParseError {
1809 code: "html.tokenizer.named_charref_no_semicolon".to_string(),
1810 message: "Named character reference was not terminated by a semicolon. (Or “&” should have been escaped as “&”.)".to_string(),
1811 span: Some(Span::new(
1812 base_start + amp_off,
1813 base_start + amp_off + 1,
1814 line,
1815 col,
1816 )),
1817 });
1818 }
1819 out.push_str(val);
1820 i = end;
1821 last = i;
1822 continue;
1823 }
1824
1825 out.push('&');
1827 i = amp_off + 1;
1828 last = i;
1829 }
1830 out.push_str(&s[last..]);
1831
1832 (out, errs)
1833}
1834
/// Classifies the value of a numeric character reference.
///
/// Returns the error code and message for a problematic value, or `None`
/// when the value needs no diagnostic. Checks are applied in this order:
/// zero, beyond U+10FFFF, surrogate, carriage return, C1 control,
/// non-character (astral, then BMP), U+FDD0..=U+FDEF, and finally other
/// control characters (tab, line feed and form feed are allowed).
fn classify_numeric_charref(cp: u32) -> Option<(&'static str, String)> {
    let is_noncharacter = (cp & 0xFFFE) == 0xFFFE;

    let verdict: (&'static str, String) = match cp {
        0 => (
            "html.tokenizer.charref_zero",
            "Character reference expands to zero.".to_string(),
        ),
        0x11_0000..=u32::MAX => (
            "html.tokenizer.charref_outside_range",
            "Character reference outside the permissible Unicode range.".to_string(),
        ),
        0xD800..=0xDFFF => (
            "html.tokenizer.charref_surrogate",
            "Character reference expands to a surrogate.".to_string(),
        ),
        0x0D => (
            "html.tokenizer.charref_cr",
            "A numeric character reference expanded to carriage return.".to_string(),
        ),
        0x80..=0x9F => (
            "html.tokenizer.charref_c1_controls",
            "A numeric character reference expanded to the C1 controls range.".to_string(),
        ),
        _ if is_noncharacter && cp > 0xFFFF => (
            "html.tokenizer.charref_astral_noncharacter",
            format!(
                "Character reference expands to an astral non-character (U+{:x}).",
                cp
            ),
        ),
        _ if is_noncharacter => (
            "html.tokenizer.charref_noncharacter",
            format!(
                "Character reference expands to a non-character (U+{:04x}).",
                cp
            ),
        ),
        0xFDD0..=0xFDEF => (
            "html.tokenizer.charref_unassigned",
            "Character reference expands to a permanently unassigned code point.".to_string(),
        ),
        // Tab, line feed and form feed are permitted; CR was handled above.
        0x01..=0x1F | 0x7F if !matches!(cp, 0x09 | 0x0A | 0x0C) => (
            "html.tokenizer.charref_control",
            format!(
                "Character reference expands to a control character (U+{:04x}).",
                cp
            ),
        ),
        _ => return None,
    };
    Some(verdict)
}
1903
/// Finds the first vertical tab (U+000B) in `s`.
///
/// Returns `(byte offset, code point, byte length)` of the match, or `None`
/// when `s` contains no vertical tab.
fn first_forbidden_code_point(s: &str) -> Option<(usize, u32, usize)> {
    s.bytes()
        .position(|byte| byte == 0x0B)
        .map(|offset| (offset, 0x000B, 1))
}
1909
/// Finds the first non-character above the Basic Multilingual Plane in `s`,
/// i.e. a code point beyond U+FFFF whose low 16 bits are `FFFE` or `FFFF`.
///
/// Returns `(byte offset, UTF-8 byte length)` of the match, or `None`.
fn first_astral_noncharacter(s: &str) -> Option<(usize, usize)> {
    s.char_indices()
        .find(|&(_, ch)| {
            let cp = u32::from(ch);
            cp > 0xFFFF && (cp & 0xFFFE) == 0xFFFE
        })
        .map(|(offset, ch)| (offset, ch.len_utf8()))
}
1919
1920fn resolve_named_ref(format: InputFormat, name: &str) -> Option<&'static str> {
1921 match (format, name) {
1922 (InputFormat::Html, _) => html_named_entity_map().get(name).copied(),
1923 (InputFormat::Xhtml, "lt;") => Some("<"),
1924 (InputFormat::Xhtml, "gt;") => Some(">"),
1925 (InputFormat::Xhtml, "amp;") => Some("&"),
1926 (InputFormat::Xhtml, "quot;") => Some("\""),
1927 (InputFormat::Xhtml, "apos;") => Some("'"),
1928 (InputFormat::Xhtml, _) => None,
1929 }
1930}
1931
#[cfg(test)]
mod resolve_named_ref_tests {
    use super::{InputFormat, resolve_named_ref};

    /// XHTML resolves exactly the five predefined entities, semicolon included.
    #[test]
    fn xhtml_supports_only_predefined_named_entities() {
        let predefined = [
            ("lt;", "<"),
            ("gt;", ">"),
            ("amp;", "&"),
            ("quot;", "\""),
            ("apos;", "'"),
        ];
        for (name, expansion) in predefined {
            assert_eq!(
                resolve_named_ref(InputFormat::Xhtml, name),
                Some(expansion)
            );
        }

        // HTML-only entities and semicolon-less names are not resolved.
        for unsupported in ["copy;", "amp"] {
            assert_eq!(resolve_named_ref(InputFormat::Xhtml, unsupported), None);
        }
    }
}
1948
/// Converts a numeric character reference value to a `char`.
///
/// Returns `None` for zero, surrogates and values beyond U+10FFFF.
fn valid_code_point(cp: u32) -> Option<char> {
    // `char::from_u32` already rejects surrogates and out-of-range values;
    // only NUL needs filtering on top of that.
    char::from_u32(cp).filter(|&ch| ch != '\0')
}
1957
/// Reports whether `b` may appear in a tag name: an ASCII letter or digit,
/// `-`, or `:`.
fn is_tag_name_char(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'-' | b':')
}
1961
/// Reports whether `haystack` begins with `needle`, ignoring ASCII case.
///
/// An empty `needle` always matches; a `needle` longer than `haystack`
/// never does.
fn starts_with_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
    haystack
        .get(..needle.len())
        .is_some_and(|prefix| prefix.eq_ignore_ascii_case(needle))
}
1965
/// Finds the `>` that closes the tag being scanned.
///
/// Scanning starts at byte offset `from` (expected to be inside the tag
/// name) and tracks just enough attribute syntax to ignore a `>` that sits
/// inside a quoted attribute value. A `>` anywhere else, including inside an
/// unquoted value, ends the tag.
///
/// Returns the offset of the closing `>`, or `None` when the input ends
/// first (or `from` is past the end of `bytes`).
///
/// A `/` directly after the tag name ends the name just like whitespace
/// does, so quotes in what follows (`<a/b="x>y">`) are tracked correctly.
/// Previously the scanner stayed in the tag-name state after such a slash
/// and closed the tag at the first `>`, even inside quotes.
fn find_tag_close(bytes: &[u8], from: usize) -> Option<usize> {
    #[derive(Clone, Copy, Debug)]
    enum State {
        TagName,
        BeforeAttrName,
        AttrName,
        AfterAttrName,
        BeforeAttrValue,
        AttrValueUnquoted,
        AttrValueQuoted(u8),
    }

    let mut state = State::TagName;
    for (i, &b) in bytes.iter().enumerate().skip(from) {
        if b == b'>' && !matches!(state, State::AttrValueQuoted(_)) {
            return Some(i);
        }
        state = match state {
            // Whitespace or a slash ends the tag name.
            State::TagName if b.is_ascii_whitespace() || b == b'/' => State::BeforeAttrName,
            // Whitespace and slashes between attributes are skipped.
            State::BeforeAttrName if !(b.is_ascii_whitespace() || b == b'/') => State::AttrName,
            State::AttrName if b.is_ascii_whitespace() => State::AfterAttrName,
            State::AttrName | State::AfterAttrName if b == b'=' => State::BeforeAttrValue,
            // Anything else after a name starts the next attribute name.
            State::AfterAttrName if !b.is_ascii_whitespace() => State::AttrName,
            State::BeforeAttrValue if b == b'"' || b == b'\'' => State::AttrValueQuoted(b),
            State::BeforeAttrValue if !b.is_ascii_whitespace() => State::AttrValueUnquoted,
            State::AttrValueUnquoted if b.is_ascii_whitespace() => State::BeforeAttrName,
            // Only the matching quote character closes a quoted value.
            State::AttrValueQuoted(q) if b == q => State::BeforeAttrName,
            unchanged => unchanged,
        };
    }
    None
}
2040
2041fn find_rawtext_end_tag(
2042 bytes: &[u8],
2043 from: usize,
2044 end_tag: &str,
2045 format: InputFormat,
2046) -> Option<usize> {
2047 let end_bytes = end_tag.as_bytes();
2048 let mut i = from;
2049 while i < bytes.len() {
2050 let off = memchr(b'<', &bytes[i..])?;
2051 let lt = i + off;
2052 if bytes.get(lt + 1) != Some(&b'/') {
2053 i = lt + 1;
2054 continue;
2055 }
2056 let name_start = lt + 2;
2057 if name_start + end_bytes.len() > bytes.len() {
2058 return None;
2059 }
2060 let candidate = &bytes[name_start..name_start + end_bytes.len()];
2061 let matches = match format {
2062 InputFormat::Html => candidate.eq_ignore_ascii_case(end_bytes),
2063 InputFormat::Xhtml => candidate == end_bytes,
2064 };
2065 if !matches {
2066 i = lt + 1;
2067 continue;
2068 }
2069 let after = bytes
2070 .get(name_start + end_bytes.len())
2071 .copied()
2072 .unwrap_or(b'>');
2073 if after.is_ascii_whitespace() || after == b'>' || after == b'/' {
2074 return Some(lt);
2075 }
2076 i = lt + 1;
2077 }
2078 None
2079}
2080
/// Chooses the diagnostic for a start tag that is cut off by end of input.
///
/// `rest` is the unterminated tag text. It is scanned for quotes: a quote
/// opens a value and only the same quote character closes it. If a quote is
/// still open at the end, the "EOF in attribute value" code and message are
/// returned; otherwise the "EOF in attribute name" pair is returned.
fn classify_start_tag_eof(rest: &[u8]) -> (String, String) {
    let open_quote = rest.iter().fold(None, |open: Option<u8>, &b| match open {
        None if b == b'\'' || b == b'"' => Some(b),
        Some(q) if q == b => None,
        unchanged => unchanged,
    });

    let (code, message) = match open_quote {
        Some(_) => (
            "html.tokenizer.eof_in_attr_value",
            "End of file reached when inside an attribute value. Ignoring tag.",
        ),
        None => (
            "html.tokenizer.eof_in_attr_name",
            "End of file occurred in an attribute name. Ignoring tag.",
        ),
    };
    (code.to_string(), message.to_string())
}
2102
/// Returns the offset of the first occurrence of `needle` in `haystack`,
/// or `None` when it does not occur.
fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> {
    haystack
        .iter()
        .enumerate()
        .find_map(|(offset, &byte)| (byte == needle).then_some(offset))
}
2106
/// Returns the offset of the first occurrence of `needle` in `haystack` at
/// or after byte offset `from`.
///
/// Returns `None` when `needle` does not occur or when `from` lies beyond
/// the end of `haystack`. An empty `needle` matches immediately at `from`.
/// Both of those edge cases previously panicked (out-of-range slice index
/// and zero-sized window respectively).
fn find_subslice(haystack: &[u8], from: usize, needle: &[u8]) -> Option<usize> {
    let tail = haystack.get(from..)?;
    if needle.is_empty() {
        // `windows(0)` panics, so handle the empty needle explicitly.
        return Some(from);
    }
    tail.windows(needle.len())
        .position(|window| window == needle)
        .map(|off| from + off)
}
2113
2114fn html_named_entity_map() -> &'static FxHashMap<&'static str, &'static str> {
2115 static MAP: OnceLock<FxHashMap<&'static str, &'static str>> = OnceLock::new();
2116 MAP.get_or_init(|| {
2117 let mut map = FxHashMap::with_capacity_and_hasher(
2118 named_entities::HTML_NAMED_ENTITIES.len(),
2119 Default::default(),
2120 );
2121 map.extend(named_entities::HTML_NAMED_ENTITIES.iter().copied());
2122 map
2123 })
2124}
2125
#[cfg(test)]
mod entity_map_tests {
    use super::{html_named_entity_map, named_entities};

    /// The map holds every table entry, including semicolon-less legacy names.
    #[test]
    fn html_named_entity_map_contains_expected_entries() {
        let map = html_named_entity_map();
        assert_eq!(map.len(), named_entities::HTML_NAMED_ENTITIES.len());

        let expected = [
            ("AMP;", "&"),
            ("NegativeMediumSpace;", "\u{200B}"),
            ("NegativeThickSpace;", "\u{200B}"),
            ("NegativeThinSpace;", "\u{200B}"),
            ("NegativeVeryThinSpace;", "\u{200B}"),
            ("NoBreak;", "\u{2060}"),
            ("ZeroWidthSpace;", "\u{200B}"),
            ("shy;", "\u{AD}"),
            ("shy", "\u{AD}"),
        ];
        for (name, expansion) in expected {
            assert_eq!(map.get(name), Some(&expansion));
        }
    }
}
2145
2146#[cfg(test)]
2147mod tests {
2148 use super::*;
2149 use html_inspector::EventSource;
2150
2151 fn collect(mut src: SimpleHtmlEventSource) -> Vec<ParseEvent> {
2152 let mut out = Vec::new();
2153 while let Some(ev) = src.next_event().unwrap() {
2154 out.push(ev);
2155 }
2156 out
2157 }
2158
2159 #[test]
2160 fn classify_start_tag_eof_prefers_attr_value_error_when_in_quote() {
2161 let (code, _msg) = classify_start_tag_eof(br#" class="unterminated"#);
2162 assert_eq!(code, "html.tokenizer.eof_in_attr_value");
2163
2164 let (code, _msg) = classify_start_tag_eof(br#" class="has'single"#);
2166 assert_eq!(code, "html.tokenizer.eof_in_attr_value");
2167 }
2168
2169 #[test]
2170 fn classify_start_tag_eof_returns_attr_name_error_when_not_in_quote() {
2171 let (code, _msg) = classify_start_tag_eof(b" class=foo");
2172 assert_eq!(code, "html.tokenizer.eof_in_attr_name");
2173
2174 let (code, _msg) = classify_start_tag_eof(br#" class="ok""#);
2176 assert_eq!(code, "html.tokenizer.eof_in_attr_name");
2177 }
2178
2179 #[test]
2180 fn bytes_at_cursor_is_false_when_out_of_bounds() {
2181 let mut src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<");
2182 assert!(src.bytes_at_cursor(b""));
2183 assert!(src.bytes_at_cursor(b"<"));
2184 assert!(!src.bytes_at_cursor(b"<>"));
2185 src.cursor = src.bytes.len();
2186 assert!(src.bytes_at_cursor(b""));
2187 assert!(!src.bytes_at_cursor(b"<"));
2188 src.cursor = src.bytes.len() + 1;
2189 assert!(!src.bytes_at_cursor(b""));
2190 assert!(!src.bytes_at_cursor(b"<"));
2191 }
2192
2193 #[test]
2194 fn valid_code_point_matches_html_scalar_value_constraints() {
2195 assert_eq!(valid_code_point(0), None);
2196 assert_eq!(valid_code_point(0xD800), None);
2197 assert_eq!(valid_code_point(0xDFFF), None);
2198 assert_eq!(valid_code_point(0x110000), None);
2199
2200 assert_eq!(valid_code_point(0x41), Some('A'));
2201 assert_eq!(
2202 valid_code_point(0x10FFFF),
2203 Some(char::from_u32(0x10FFFF).unwrap())
2204 );
2205 }
2206
2207 #[test]
2208 fn first_forbidden_code_point_finds_vertical_tab_by_byte_offset() {
2209 assert_eq!(first_forbidden_code_point("abc"), None);
2210
2211 let s = "❤\u{000B}x";
2212 let (idx, cp, len) = first_forbidden_code_point(s).unwrap();
2213 assert_eq!(idx, "❤".len());
2214 assert_eq!(cp, 0x000B);
2215 assert_eq!(len, 1);
2216 }
2217
2218 #[test]
2219 fn first_astral_noncharacter_finds_noncharacters_and_reports_byte_len() {
2220 assert_eq!(first_astral_noncharacter("abc"), None);
2221
2222 let ch = char::from_u32(0x1FFFE).unwrap();
2223 let s = format!("a{ch}b");
2224 let (idx, len) = first_astral_noncharacter(&s).unwrap();
2225 assert_eq!(idx, 1);
2226 assert_eq!(len, ch.len_utf8());
2227 }
2228
2229 #[test]
2230 fn normalize_name_lowercases_ascii_in_html_only() {
2231 let html = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
2232 assert_eq!(html.normalize_name("DiV❤"), "div❤");
2233 assert_eq!(html.normalize_name("div"), "div");
2234
2235 let xhtml = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "");
2236 assert_eq!(xhtml.normalize_name("DiV❤"), "DiV❤");
2237 }
2238
/// `normalize_name` accepts `&str`, `String` and `Cow<str>` alike, and when it is
/// handed an owned string the returned `String` keeps the same buffer
/// (identical pointer and capacity).
#[test]
fn normalize_name_accepts_cow_without_extra_allocation_for_owned() {
    use std::borrow::Cow;

    let html = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
    let xhtml = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "");

    // Borrowed and owned `Cow` inputs must agree with the plain `&str` call in both formats.
    for source in [&html, &xhtml] {
        assert_eq!(
            source.normalize_name(Cow::Borrowed("DiV")),
            source.normalize_name("DiV")
        );
        assert_eq!(
            source.normalize_name(Cow::Owned("DiV".to_string())),
            source.normalize_name("DiV")
        );
    }

    // HTML, owned `Cow` that needs lowercasing.
    let owned = "DiV".to_string();
    let allocation = (owned.as_ptr(), owned.capacity());
    let lowered = html.normalize_name(Cow::Owned(owned));
    assert_eq!(lowered, "div");
    assert_eq!((lowered.as_ptr(), lowered.capacity()), allocation);

    // HTML, plain `String` that is already lowercase.
    let owned = "div".to_string();
    let allocation = (owned.as_ptr(), owned.capacity());
    let untouched = html.normalize_name(owned);
    assert_eq!(untouched, "div");
    assert_eq!((untouched.as_ptr(), untouched.capacity()), allocation);

    // XHTML, owned `Cow`: the name is returned as-is.
    let owned = "DiV".to_string();
    let allocation = (owned.as_ptr(), owned.capacity());
    let preserved = xhtml.normalize_name(Cow::Owned(owned));
    assert_eq!(preserved, "DiV");
    assert_eq!((preserved.as_ptr(), preserved.capacity()), allocation);
}
2285
/// Events already queued in `pending` are still handed out after `finished` is set;
/// only once the queue is empty does `next_event` return `None`.
#[test]
fn next_event_drains_pending_even_if_finished() {
    let mut source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
    source.finished = true;
    let queued = ParseEvent::Text {
        text: "x".to_string(),
        span: None,
    };
    source.pending.push_back(queued);

    let first = source.next_event().unwrap();
    assert!(matches!(
        first,
        Some(ParseEvent::Text { ref text, .. }) if text == "x"
    ));

    let second = source.next_event().unwrap();
    assert!(second.is_none());
}
2300
/// `pop_open_element` cuts both stacks at the last entry matching the name.
/// With the same stacks, popping "b" keeps `[a, b]` in HTML (the cut lands on
/// "B") but only `[a]` in XHTML (the cut lands on "b").
#[test]
fn pop_open_element_truncates_stacks_at_last_match() {
    let names = |items: &[&str]| -> Vec<String> { items.iter().map(|s| s.to_string()).collect() };
    let mixed_namespaces = || {
        vec![
            HtmlNamespace::Html,
            HtmlNamespace::Html,
            HtmlNamespace::Svg,
            HtmlNamespace::Math,
        ]
    };

    let mut html = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
    html.open_elements = names(&["a", "b", "B", "c"]);
    html.open_namespaces = mixed_namespaces();
    html.pop_open_element("b");
    assert_eq!(html.open_elements, names(&["a", "b"]));
    assert_eq!(
        html.open_namespaces,
        vec![HtmlNamespace::Html, HtmlNamespace::Html]
    );

    let mut xhtml = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "");
    xhtml.open_elements = names(&["a", "b", "B", "c"]);
    xhtml.open_namespaces = mixed_namespaces();
    xhtml.pop_open_element("b");
    assert_eq!(xhtml.open_elements, names(&["a"]));
    assert_eq!(xhtml.open_namespaces, vec![HtmlNamespace::Html]);
}
2340
/// Popping a name that is not on the stack leaves both stacks exactly as they were.
#[test]
fn pop_open_element_is_noop_when_name_is_missing() {
    let elements = || vec!["a".to_string(), "b".to_string()];
    let namespaces = || vec![HtmlNamespace::Html, HtmlNamespace::Svg];

    let mut source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
    source.open_elements = elements();
    source.open_namespaces = namespaces();

    source.pop_open_element("missing");

    assert_eq!(source.open_elements, elements());
    assert_eq!(source.open_namespaces, namespaces());
}
2353
2354 fn as_start_tag(ev: &ParseEvent) -> Option<(&str, &[html_inspector::Attribute])> {
2355 match ev {
2356 ParseEvent::StartTag { name, attrs, .. } => Some((name, attrs.as_slice())),
2357 _ => None,
2358 }
2359 }
2360
/// Valid UTF-8 input comes back as a borrowed `Cow` with the same text.
#[test]
fn str_from_bytes_lossy_borrows_valid_utf8() {
    let decoded = str_from_bytes_lossy(b"hello");
    let is_borrowed = matches!(decoded, Cow::Borrowed(_));
    assert!(is_borrowed);
    assert_eq!(&*decoded, "hello");
}
2367
/// An invalid byte forces an owned `Cow`, with U+FFFD standing in for the bad byte.
#[test]
fn str_from_bytes_lossy_allocates_on_invalid_utf8() {
    let invalid = [0xff, b'a'];
    let decoded = str_from_bytes_lossy(&invalid);
    let is_owned = matches!(decoded, Cow::Owned(_));
    assert!(is_owned);
    assert_eq!(&*decoded, "�a");
}
2374
/// A doctype name containing an invalid UTF-8 byte is still lowercased: the bytes
/// `0xFF 'A'` are reported as U+FFFD followed by "a".
#[test]
fn doctype_name_normalizes_even_when_decoding_allocates() {
    // `<!DOCTYPE ` followed by an invalid byte, an uppercase letter and the closing `>`.
    let mut input = b"<!DOCTYPE ".to_vec();
    input.extend_from_slice(&[0xff, b'A', b'>']);

    let events = collect(SimpleHtmlEventSource::from_bytes(
        "t",
        InputFormat::Html,
        input,
    ));
    let doctype_name = events.iter().find_map(|event| {
        if let ParseEvent::Doctype {
            name: Some(name), ..
        } = event
        {
            Some(name.as_str())
        } else {
            None
        }
    });
    assert_eq!(doctype_name, Some("�a"));
}
2393
/// `<` followed by a character that cannot start a tag name produces a
/// `bad_char_after_lt` error and then the raw characters as text.
#[test]
fn treats_lt_not_followed_by_tag_name_as_text() {
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<$"));
    let [first, second, ..] = events.as_slice() else {
        panic!("expected at least two events");
    };
    assert!(matches!(
        first,
        ParseEvent::ParseError { code, .. } if code == "html.tokenizer.bad_char_after_lt"
    ));
    assert!(matches!(second, ParseEvent::Text { text, .. } if text == "<$"));
}
2403
/// A lone `<` at end of input yields exactly one event: the `eof_after_lt` parse
/// error, with its message and a span covering that single byte.
#[test]
fn lt_at_eof_emits_tokenizer_eof_after_lt_error() {
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<"));
    assert_eq!(events.len(), 1);

    match &events[0] {
        ParseEvent::ParseError {
            code,
            message,
            span,
        } => {
            assert_eq!(code, "html.tokenizer.eof_after_lt");
            assert_eq!(message, "End of file after “<”.");
            assert_eq!(span.unwrap(), Span::new(0, 1, 1, 1));
        }
        _ => panic!("expected a parse error event"),
    }
}
2421
/// A `>` inside a quoted attribute value does not end the tag; it is part of the value.
#[test]
fn parses_tag_end_ignoring_gt_inside_quotes() {
    let markup = "<a title=\">\">x</a>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    let (tag, attributes) = as_start_tag(&events[0]).unwrap();
    assert_eq!(tag, "a");

    let [title] = attributes else {
        panic!("expected exactly one attribute");
    };
    assert_eq!(title.name, "title");
    assert_eq!(title.value.as_deref(), Some(">"));
}
2432
/// `<` followed by whitespace is passed through as text; the very first event is
/// already that text, so no error precedes it.
#[test]
fn lt_followed_by_whitespace_is_literal_text_without_error() {
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, "< "));
    let first_is_literal_text = matches!(
        events.first(),
        Some(ParseEvent::Text { text, .. }) if text == "< "
    );
    assert!(first_is_literal_text);
}
2439
/// In XHTML, whitespace between `<?` and the target is skipped: the target is
/// "xml-stylesheet" and the remainder becomes the data.
#[test]
fn xhtml_processing_instruction_skips_leading_whitespace_in_target() {
    let markup = "<? xml-stylesheet href=\"a\"?>";
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Xhtml,
        markup,
    ));

    let found = events.iter().any(|event| match event {
        ParseEvent::ProcessingInstruction { target, data, .. } => {
            target == "xml-stylesheet" && data == "href=\"a\""
        }
        _ => false,
    });
    assert!(found);
}
2454
/// A `PUBLIC` or `SYSTEM` keyword with no identifier after it reports the
/// matching "expected … id" error.
#[test]
fn doctype_public_and_system_missing_ids_emit_expected_errors() {
    let cases = [
        (
            "<!DOCTYPE html PUBLIC>",
            "html.tokenizer.doctype.expected_public_id",
        ),
        (
            "<!DOCTYPE html SYSTEM>",
            "html.tokenizer.doctype.expected_system_id",
        ),
    ];

    for (markup, expected_code) in cases {
        let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));
        let reported = events.iter().any(|event| {
            matches!(event, ParseEvent::ParseError { code, .. } if code == expected_code)
        });
        assert!(reported);
    }
}
2471
/// Two doctype scanning details:
/// 1. whitespace between `<!` and `DOCTYPE` is tolerated and a `Doctype` event is
///    still produced;
/// 2. a public identifier containing a line break followed by `>` still reports
///    `gt_in_public_id`.
#[test]
fn doctype_allows_whitespace_after_bang_and_newlines_affect_span_tracking() {
    let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<! DOCTYPE html>");
    let evs = collect(src);
    assert!(evs.iter().any(|e| matches!(e, ParseEvent::Doctype { .. })));

    let src = SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        // `\n` must be a real newline here. The previous literal used `\\n`
        // (a backslash followed by the letter n), so no line break was ever fed
        // to the doctype scanner despite the test's name.
        "<!DOCTYPE html PUBLIC \"a\n>\" \"sys\">",
    );
    let evs = collect(src);
    assert!(evs.iter().any(|e| matches!(
        e,
        ParseEvent::ParseError { code, .. } if code == "html.tokenizer.doctype.gt_in_public_id"
    )));
}
2489
/// `parse_start_tag` tolerates whitespace before the tag name in the buffer it is given.
#[test]
fn parse_start_tag_allows_leading_whitespace_in_inside_buffer() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
    let parsed = parse_start_tag(&source, " a", 0, 1, 1, 0).unwrap();
    // First tuple field is the tag name; the remaining fields are not checked here.
    assert_eq!(parsed.0, "a");
}
2496
/// Errors raised while parsing a start tag carry the span of the whole tag:
/// for `<a x==y >` that is bytes 0..9, starting at line 1, column 1.
#[test]
fn parse_start_tag_errors_use_the_full_tag_span() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        "<a x==y ></a>",
    ));

    let span = events
        .iter()
        .find_map(|event| {
            if let ParseEvent::ParseError {
                code,
                span: Some(span),
                ..
            } = event
            {
                (code == "html.tokenizer.equals_at_start_unquoted").then_some(*span)
            } else {
                None
            }
        })
        .expect("expected equals_at_start_unquoted error");

    assert_eq!((span.byte_start, span.byte_end), (0, 9));
    assert_eq!((span.line, span.col), (1, 1));
}
2515
/// Several spaces between an attribute name and `=` are skipped and the value
/// is still attached to that attribute.
#[test]
fn multiple_spaces_after_attr_name_are_accepted() {
    // Two spaces before `=`: with a single space the input would not exercise
    // the "multiple spaces" case the test is named for.
    let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<a x  =\"y\"></a>");
    let evs = collect(src);
    let (_name, attrs) = as_start_tag(&evs[0]).unwrap();
    let x = attrs.iter().find(|a| a.name == "x").unwrap();
    assert_eq!(x.value.as_deref(), Some("y"));
}
2524
/// Input consisting only of `<` characters contains no end tag, so the search
/// runs to the end and returns `None`.
#[test]
fn rawtext_end_tag_search_can_exhaust_input() {
    let found = find_rawtext_end_tag(b"<<<<", 0, "script", InputFormat::Html);
    assert_eq!(found, None);
}
2532
/// Two quoted attributes written back to back report `no_space_between_attrs`,
/// yet both attributes still end up on the start tag.
#[test]
fn quoted_attributes_without_space_emit_no_space_between_attrs_error() {
    let markup = "<a title=\"x\"id=\"y\"></a>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    let reported = events.iter().any(|event| {
        matches!(
            event,
            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.no_space_between_attrs"
        )
    });
    assert!(reported);

    let (tag, attributes) = events.iter().find_map(as_start_tag).unwrap();
    assert_eq!(tag, "a");
    for wanted in ["title", "id"] {
        assert!(attributes.iter().any(|attr| attr.name == wanted));
    }
}
2547
/// A single-quoted value that contains a `"` and is never closed is still
/// classified as EOF inside an attribute value.
#[test]
fn classify_start_tag_eof_covers_mixed_quote_tracking() {
    let classification = classify_start_tag_eof(b" a='x\"");
    assert_eq!(classification.0, "html.tokenizer.eof_in_attr_value");
}
2553
/// An unclosed double-quoted value is EOF in an attribute value; once the quote
/// is closed and another name begins, it is EOF in an attribute name.
#[test]
fn classify_start_tag_eof_tracks_double_quotes() {
    let unclosed_value = classify_start_tag_eof(b" a=\"b").0;
    assert_eq!(unclosed_value, "html.tokenizer.eof_in_attr_value");

    let trailing_name = classify_start_tag_eof(b" a=\"b\" c").0;
    assert_eq!(trailing_name, "html.tokenizer.eof_in_attr_name");
}
2562
/// The `as_start_tag` helper returns `None` for events that are not start tags.
#[test]
fn start_tag_helper_returns_none_for_non_start_tag() {
    let text_event = ParseEvent::Text {
        text: "x".to_string(),
        span: None,
    };
    assert!(as_start_tag(&text_event).is_none());
}
2573
/// Inside `<script>` a `<` does not start a tag: the whole body is one text event
/// framed by the script start and end tags.
#[test]
fn rawtext_script_does_not_tokenize_lt() {
    let markup = "<script>if (a < b) {}</script>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    let [open, body, close, ..] = events.as_slice() else {
        panic!("expected at least three events");
    };
    assert!(matches!(open, ParseEvent::StartTag { name, .. } if name == "script"));
    assert!(matches!(body, ParseEvent::Text { text, .. } if text == "if (a < b) {}"));
    assert!(matches!(close, ParseEvent::EndTag { name, .. } if name == "script"));
}
2586
/// Inside `<textarea>` a raw `<` is kept as text instead of starting a tag,
/// while character references such as `&lt;` are still decoded.
#[test]
fn rcdata_textarea_does_not_tokenize_lt_but_decodes_entities() {
    let src = SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        // One escaped `<` and one raw `<`, so both halves of the test name are
        // exercised; without `&lt;` nothing would be decoded.
        "<textarea>1 &lt; 2 < 3</textarea>",
    );
    let evs = collect(src);
    assert!(matches!(evs[0], ParseEvent::StartTag { ref name, .. } if name == "textarea"));
    assert!(matches!(evs[1], ParseEvent::Text { ref text, .. } if text == "1 < 2 < 3"));
    assert!(matches!(evs[2], ParseEvent::EndTag { ref name, .. } if name == "textarea"));
}
2599
/// Character references inside `<title>` are decoded in the emitted text.
#[test]
fn rcdata_title_decodes_entities() {
    // The input must contain an actual reference (`&lt;`); a literal `<` would
    // leave the decoding path untested.
    let src =
        SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<title>1 &lt; 2</title>");
    let evs = collect(src);
    assert!(matches!(evs[0], ParseEvent::StartTag { ref name, .. } if name == "title"));
    assert!(matches!(evs[1], ParseEvent::Text { ref text, .. } if text == "1 < 2"));
    assert!(matches!(evs[2], ParseEvent::EndTag { ref name, .. } if name == "title"));
}
2609
/// After `<plaintext>` everything up to end of input is a single text event;
/// markup in it is not tokenized and no further events follow.
#[test]
fn plaintext_consumes_rest_of_document_as_text() {
    let markup = "<plaintext><b>hi</b>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    let [open, rest] = events.as_slice() else {
        panic!("expected exactly two events");
    };
    assert!(matches!(open, ParseEvent::StartTag { name, .. } if name == "plaintext"));
    assert!(matches!(rest, ParseEvent::Text { text, .. } if text == "<b>hi</b>"));
}
2618
/// In XHTML a CDATA section becomes one text event holding the section's content,
/// with a span covering the whole 17-byte section.
#[test]
fn xhtml_cdata_emits_text() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Xhtml,
        "<![CDATA[<tag>]]>",
    ));

    let expected = ParseEvent::Text {
        text: "<tag>".to_string(),
        span: Some(Span::new(0, 17, 1, 1)),
    };
    assert_eq!(events, vec![expected]);
}
2631
/// In HTML, a CDATA section outside foreign content reports `bogus_comment` and
/// is emitted as a comment event.
#[test]
fn html_cdata_outside_foreign_content_is_bogus_comment() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        "<![CDATA[<tag>]]>",
    ));

    let (mut saw_error, mut saw_comment) = (false, false);
    for event in &events {
        match event {
            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.bogus_comment" => {
                saw_error = true;
            }
            ParseEvent::Comment { .. } => saw_comment = true,
            _ => {}
        }
    }
    assert!(saw_error);
    assert!(saw_comment);
}
2642
/// Inside `<svg>`, a CDATA section in HTML input is emitted as text and does not
/// report `bogus_comment`. The leading nested comment adds an unrelated parse
/// event to the stream that the negative check must not match.
#[test]
fn html_cdata_inside_svg_emits_text_without_bogus_comment_error() {
    let markup = "<!--<!-- --><svg><script><![CDATA[if (a < b) {}]]></script></svg>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    let bogus_comment_errors = events
        .iter()
        .filter(|event| {
            matches!(
                event,
                ParseEvent::ParseError { code, .. } if code == "html.tokenizer.bogus_comment"
            )
        })
        .count();
    assert_eq!(bogus_comment_errors, 0);

    let has_cdata_text = events.iter().any(|event| match event {
        ParseEvent::Text { text, .. } => text.contains("if (a < b) {}"),
        _ => false,
    });
    assert!(has_cdata_text);
}
2660
/// A doctype with `PUBLIC "pub" 'sys'` exposes its name, public identifier and
/// system identifier; either quote style is accepted.
#[test]
fn doctype_parses_public_and_system_ids() {
    let markup = "<!DOCTYPE html PUBLIC \"pub\" 'sys'><html></html>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    let ids = events
        .iter()
        .find_map(|event| {
            if let ParseEvent::Doctype {
                name,
                public_id,
                system_id,
                ..
            } = event
            {
                Some((name.as_deref(), public_id.as_deref(), system_id.as_deref()))
            } else {
                None
            }
        })
        .expect("expected a doctype event");

    assert_eq!(ids, (Some("html"), Some("pub"), Some("sys")));
}
2685
/// `&lt;` in an attribute value and `&amp;` in text are both decoded to the
/// characters they name.
#[test]
fn decodes_basic_entities_in_text_and_attributes() {
    let src = SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        // The references must appear in the input; with literal `<` and `&` the
        // assertions would pass without any decoding taking place.
        "<p title=\"a &lt; b\">Tom &amp; Jerry</p>",
    );
    let evs = collect(src);
    let (_name, attrs) = as_start_tag(&evs[0]).unwrap();
    let title = attrs.iter().find(|a| a.name == "title").unwrap();
    assert_eq!(title.value.as_deref(), Some("a < b"));
    assert!(matches!(evs[1], ParseEvent::Text { ref text, .. } if text == "Tom & Jerry"));
}
2699
/// In text, a named reference lacking its `;` (`&copy`) is still decoded to `©`
/// but reports `named_charref_no_semicolon`.
#[test]
fn named_char_ref_without_semicolon_emits_error_and_decodes_in_text() {
    // The input has to be the reference itself; a literal `©` can never trigger
    // the missing-semicolon error.
    let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>&copy</p>");
    let evs = collect(src);
    assert!(evs.iter().any(|e| matches!(
        e,
        ParseEvent::ParseError { code, .. } if code == "html.tokenizer.named_charref_no_semicolon"
    )));
    assert!(evs.iter().any(|e| matches!(
        e,
        ParseEvent::Text { text, .. } if text == "©"
    )));
}
2713
/// The `named_charref_no_semicolon` error raised for a reference in text points
/// at the `&` itself: a one-byte span at the ampersand's offset on line 1.
#[test]
fn named_char_ref_without_semicolon_span_matches_ampersand_location_in_text() {
    // The markup must contain `&copy`; with a literal `©` there is no `&` to
    // locate and the `find('&')` below would panic.
    let html = "<p>a &copy=1</p>";
    let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, html);
    let evs = collect(src);
    let span = evs
        .iter()
        .find_map(|e| match e {
            ParseEvent::ParseError {
                code,
                span: Some(span),
                ..
            } if code == "html.tokenizer.named_charref_no_semicolon" => Some(*span),
            _ => None,
        })
        .expect("expected named_charref_no_semicolon parse error");

    let amp = html.find('&').expect("expected '&' in HTML");
    assert_eq!(span.byte_start, amp);
    assert_eq!(span.byte_end, amp + 1);
    assert_eq!(span.line, 1);
    // Everything before `&` is single-byte ASCII, so the expected column is offset + 1.
    assert_eq!(span.col, (amp + 1) as u32);
}
2737
/// The `named_charref_no_semicolon` error raised for a reference inside an
/// attribute value points at the `&` itself: a one-byte span at the
/// ampersand's offset on line 1.
#[test]
fn named_char_ref_without_semicolon_span_matches_ampersand_location_in_attribute() {
    // The attribute must contain `&copy` (followed by `.`, which neither
    // continues the name nor is `=`); with a literal `©` there is no `&` and
    // the `find('&')` below would panic.
    let html = "<p title=\"&copy.\">x</p>";
    let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, html);
    let evs = collect(src);
    let span = evs
        .iter()
        .find_map(|e| match e {
            ParseEvent::ParseError {
                code,
                span: Some(span),
                ..
            } if code == "html.tokenizer.named_charref_no_semicolon" => Some(*span),
            _ => None,
        })
        .expect("expected named_charref_no_semicolon parse error");

    let amp = html.find('&').expect("expected '&' in HTML");
    assert_eq!(span.byte_start, amp);
    assert_eq!(span.byte_end, amp + 1);
    assert_eq!(span.line, 1);
    // Everything before `&` is single-byte ASCII, so the expected column is offset + 1.
    assert_eq!(span.col, (amp + 1) as u32);
}
2761
/// Inside an attribute value, a semicolon-less named reference directly
/// followed by `=` is left as written and raises no missing-semicolon error.
/// The leading nested comment adds an unrelated parse event to the stream that
/// the negative check must not match.
#[test]
fn named_char_ref_without_semicolon_not_decoded_in_attribute_when_followed_by_equals() {
    let src = SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        // `&copy=1` must be spelled out; a literal `©=1` contains no reference,
        // which made the "not decoded" check vacuous.
        "<!--<!-- --><a title=\"&copy=1\"></a>",
    );
    let evs = collect(src);
    assert!(!evs.iter().any(|e| matches!(
        e,
        ParseEvent::ParseError { code, .. } if code == "html.tokenizer.named_charref_no_semicolon"
    )));
    let (name, attrs) = evs.iter().find_map(as_start_tag).unwrap();
    assert_eq!(name, "a");
    let title = attrs.iter().find(|a| a.name == "title").unwrap();
    // The raw reference text survives untouched.
    assert_eq!(title.value.as_deref(), Some("&copy=1"));
}
2779
/// A numeric reference lacking its `;` (`&#65`) is decoded to "A" and reports
/// `charref_no_semicolon`.
#[test]
fn numeric_char_ref_without_semicolon_emits_error_and_decodes() {
    // The input has to be the reference itself; a plain "A" can never trigger
    // the missing-semicolon error.
    let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>&#65</p>");
    let evs = collect(src);
    assert!(evs.iter().any(|e| matches!(
        e,
        ParseEvent::ParseError { code, .. } if code == "html.tokenizer.charref_no_semicolon"
    )));
    assert!(evs.iter().any(|e| matches!(
        e,
        ParseEvent::Text { text, .. } if text == "A"
    )));
}
2793
/// `&#0;` reports `charref_zero` and is replaced by U+FFFD in the emitted text.
#[test]
fn numeric_char_ref_zero_emits_error_and_replacement_char() {
    // The input has to be the reference `&#0;`; a literal U+FFFD in the markup
    // contains no character reference and so cannot raise `charref_zero`.
    let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>&#0;</p>");
    let evs = collect(src);
    assert!(evs.iter().any(|e| matches!(
        e,
        ParseEvent::ParseError { code, .. } if code == "html.tokenizer.charref_zero"
    )));
    assert!(evs.iter().any(|e| matches!(
        e,
        ParseEvent::Text { text, .. } if text == "\u{FFFD}"
    )));
}
2807
/// A U+000B character in text content reports `forbidden_code_point`.
#[test]
fn forbidden_code_point_in_text_emits_error() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        "<p>\u{000B}</p>",
    ));
    let reported = events.iter().any(|event| match event {
        ParseEvent::ParseError { code, .. } => code == "html.tokenizer.forbidden_code_point",
        _ => false,
    });
    assert!(reported);
}
2817
/// A U+10FFFE character in text content reports `astral_noncharacter`.
#[test]
fn astral_noncharacter_in_text_emits_error() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        "<p>\u{10FFFE}</p>",
    ));
    let reported = events.iter().any(|event| match event {
        ParseEvent::ParseError { code, .. } => code == "html.tokenizer.astral_noncharacter",
        _ => false,
    });
    assert!(reported);
}
2827
/// A `<!--` inside an open comment reports `nested_comment` with a span that
/// covers exactly the inner `<!--`; the comment itself is still emitted.
#[test]
fn nested_comment_emits_parse_error() {
    let html = "<!-- a <!-- b -->";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, html));

    let span = events
        .iter()
        .find_map(|event| {
            if let ParseEvent::ParseError {
                code,
                span: Some(span),
                ..
            } = event
            {
                (code == "html.tokenizer.nested_comment").then_some(*span)
            } else {
                None
            }
        })
        .expect("expected nested_comment parse error");

    // The second (inner) opener is the one the error must point at.
    let nested = html.rfind("<!--").expect("expected nested '<!--' in HTML");
    assert_eq!(span.byte_start, nested);
    assert_eq!(span.byte_end, nested + "<!--".len());
    assert_eq!(span.line, 1);
    assert_eq!(span.col, (nested + 1) as u32);

    let has_comment = events
        .iter()
        .any(|event| matches!(event, ParseEvent::Comment { text, .. } if text.contains("a")));
    assert!(has_comment);
}
2854
/// In HTML, `<?xml … ?>` reports `processing_instruction` and its content is
/// emitted as a comment.
#[test]
fn html_processing_instruction_emits_error_and_comment() {
    let markup = "<?xml version=\"1.0\"?><p>x</p>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    let (mut saw_error, mut saw_comment) = (false, false);
    for event in &events {
        match event {
            ParseEvent::ParseError { code, .. }
                if code == "html.tokenizer.processing_instruction" =>
            {
                saw_error = true;
            }
            ParseEvent::Comment { text, .. } if text.contains("xml version") => {
                saw_comment = true;
            }
            _ => {}
        }
    }
    assert!(saw_error);
    assert!(saw_comment);
}
2872
/// In XHTML a processing instruction is the first event, carrying its target
/// and its data.
#[test]
fn xhtml_processing_instruction_is_emitted_as_processing_instruction_event() {
    let markup = "<?xml-stylesheet href=\"a.css\" type=\"text/css\"?><root/>";
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Xhtml,
        markup,
    ));

    let first_is_pi = match &events[0] {
        ParseEvent::ProcessingInstruction { target, data, .. } => {
            target == "xml-stylesheet" && data.contains("href=\"a.css\"")
        }
        _ => false,
    };
    assert!(first_is_pi);
}
2887
/// `<!DOCTYPEhtml>` (no space before the name) reports `missing_space_before_name`.
#[test]
fn doctype_missing_space_before_name_emits_error() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        "<!DOCTYPEhtml><p>x</p>",
    ));
    let reported = events.iter().any(|event| match event {
        ParseEvent::ParseError { code, .. } => {
            code == "html.tokenizer.doctype.missing_space_before_name"
        }
        _ => false,
    });
    assert!(reported);
}
2897
/// The `HtmlEventSource` wrapper, given XHTML input, reports the source name and
/// format it was built with and yields the document's start tag.
#[test]
fn html_event_source_wrapper_uses_simple_backend_for_xhtml() {
    let mut source = HtmlEventSource::from_str("t", InputFormat::Xhtml, "<root/>").unwrap();
    assert_eq!(source.source_name(), "t");
    assert_eq!(source.format(), InputFormat::Xhtml);

    // Pull events until the source reports the end of the stream.
    let events: Vec<ParseEvent> =
        std::iter::from_fn(|| source.next_event().unwrap()).collect();

    let has_root = events
        .iter()
        .any(|event| matches!(event, ParseEvent::StartTag { name, .. } if name == "root"));
    assert!(has_root);
}
2912
/// With `<p>` nested in `<svg><foreignObject>`, start tags are emitted for both
/// the `foreignObject` element (name compared case-insensitively) and the `p`.
#[test]
fn foreignobject_in_svg_switches_insertion_namespace_to_html() {
    let markup = "<svg><foreignObject><p>hi</p></foreignObject></svg>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    let start_tag_names: Vec<&ParseEvent> = events
        .iter()
        .filter(|event| matches!(event, ParseEvent::StartTag { .. }))
        .collect();

    let has_foreign_object = start_tag_names.iter().any(|event| {
        matches!(event, ParseEvent::StartTag { name, .. } if name.eq_ignore_ascii_case("foreignobject"))
    });
    assert!(has_foreign_object);

    let has_paragraph = start_tag_names
        .iter()
        .any(|event| matches!(event, ParseEvent::StartTag { name, .. } if name == "p"));
    assert!(has_paragraph);
}
2927
/// An XHTML CDATA section that never reaches `]]>` reports `xml.cdata_eof`.
#[test]
fn unterminated_cdata_emits_xml_cdata_eof_error() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Xhtml,
        "<![CDATA[unterminated",
    ));
    let reported = events.iter().any(|event| match event {
        ParseEvent::ParseError { code, .. } => code == "xml.cdata_eof",
        _ => false,
    });
    assert!(reported);
}
2937
/// In HTML, `<?xml` with no closing `>` still reports `processing_instruction`
/// and emits the remaining input as a comment.
#[test]
fn bogus_comment_without_gt_emits_comment_then_finishes() {
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<?xml"));

    let (mut saw_error, mut saw_comment) = (false, false);
    for event in &events {
        match event {
            ParseEvent::ParseError { code, .. }
                if code == "html.tokenizer.processing_instruction" =>
            {
                saw_error = true;
            }
            ParseEvent::Comment { text, .. } if text.contains("xml") => saw_comment = true,
            _ => {}
        }
    }
    assert!(saw_error);
    assert!(saw_comment);
}
2951
/// An XHTML processing instruction that never reaches `?>` reports `xml.pi_eof`.
#[test]
fn unterminated_processing_instruction_emits_xml_pi_eof_error() {
    let markup = "<?xml-stylesheet href=\"a\"";
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Xhtml,
        markup,
    ));
    let reported = events.iter().any(|event| match event {
        ParseEvent::ParseError { code, .. } => code == "xml.pi_eof",
        _ => false,
    });
    assert!(reported);
}
2962
/// `</` followed by whitespace and end of input yields exactly one event:
/// the `garbage_after_lt_slash` parse error.
#[test]
fn garbage_after_lt_slash_at_eof_emits_error_and_finishes() {
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, "</ "));

    let reported = events.iter().any(|event| match event {
        ParseEvent::ParseError { code, .. } => code == "html.tokenizer.garbage_after_lt_slash",
        _ => false,
    });
    assert!(reported);
    assert_eq!(events.len(), 1);
}
2973
/// A `>` inside a quoted system identifier does not end the doctype: the error
/// `gt_in_system_id` is reported, the identifier keeps the `>`, and parsing
/// continues with the `<html>` tag that follows.
#[test]
fn doctype_end_finder_ignores_gt_inside_quoted_identifiers() {
    let markup = "<!DOCTYPE html SYSTEM \"a> b\"><html></html>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    let gt_reported = events.iter().any(|event| match event {
        ParseEvent::ParseError { code, .. } => code == "html.tokenizer.doctype.gt_in_system_id",
        _ => false,
    });
    assert!(gt_reported);

    let system_id = events
        .iter()
        .find_map(|event| {
            if let ParseEvent::Doctype { system_id, .. } = event {
                system_id.as_deref()
            } else {
                None
            }
        })
        .expect("expected doctype event");
    assert_eq!(system_id, "a> b");

    let html_opened = events
        .iter()
        .any(|event| matches!(event, ParseEvent::StartTag { name, .. } if name == "html"));
    assert!(html_opened);
}
3002
/// A system identifier whose quote is never closed and which contains `>`
/// reports both `gt_in_system_id` and `eof_in_system_id`.
#[test]
fn doctype_system_identifier_gt_and_unclosed_quote_emit_errors() {
    let markup = "<!DOCTYPE html SYSTEM \"a> b><p>x</p>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    for expected_code in [
        "html.tokenizer.doctype.gt_in_system_id",
        "html.tokenizer.doctype.eof_in_system_id",
    ] {
        let reported = events.iter().any(|event| {
            matches!(event, ParseEvent::ParseError { code, .. } if code == expected_code)
        });
        assert!(reported);
    }
}
3020
/// After a complete `PUBLIC "pub"`, the second quoted string is the system
/// identifier; leaving it unclosed with a `>` inside reports both
/// `gt_in_system_id` and `eof_in_system_id`.
#[test]
fn doctype_public_identifier_gt_and_unclosed_quote_emit_errors() {
    let markup = "<!DOCTYPE html PUBLIC \"pub\" \"a> b";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    for expected_code in [
        "html.tokenizer.doctype.gt_in_system_id",
        "html.tokenizer.doctype.eof_in_system_id",
    ] {
        let reported = events.iter().any(|event| {
            matches!(event, ParseEvent::ParseError { code, .. } if code == expected_code)
        });
        assert!(reported);
    }
}
3038
/// The simple event source reports the name and format it was constructed with.
#[test]
fn simple_event_source_exposes_source_name_and_format() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>x</p>");

    let reported_name = source.source_name();
    let reported_format = source.format();

    assert_eq!(reported_name, "t");
    assert_eq!(reported_format, InputFormat::Html);
}
3045
/// Once the source has been fully drained and `finished` is set, a further
/// `scan_next` call succeeds.
#[test]
fn scan_next_is_noop_when_finished_is_true() {
    let mut source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<p>x</p>");

    // Drain every event.
    loop {
        if source.next_event().unwrap().is_none() {
            break;
        }
    }

    assert!(source.finished);
    source.scan_next().unwrap();
}
3053
/// In HTML input, a CDATA section directly inside `<svg>` is emitted as a text
/// event holding the section's content.
#[test]
fn cdata_in_html_inside_svg_is_parsed_as_text() {
    let markup = "<svg><![CDATA[<tag>]]></svg>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    let has_cdata_text = events.iter().any(|event| match event {
        ParseEvent::Text { text, .. } => text == "<tag>",
        _ => false,
    });
    assert!(has_cdata_text);
}
3064
/// `</ x>` reports `garbage_after_lt_slash`, and `</1>` (not a valid end tag)
/// shows up inside a text event.
#[test]
fn end_tag_garbage_and_non_tag_end_sequences_are_coalesced_as_text() {
    let markup = "</ x> </1> <p>x</p>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    let (mut saw_error, mut saw_literal) = (false, false);
    for event in &events {
        match event {
            ParseEvent::ParseError { code, .. }
                if code == "html.tokenizer.garbage_after_lt_slash" =>
            {
                saw_error = true;
            }
            ParseEvent::Text { text, .. } if text.contains("</1>") => saw_literal = true,
            _ => {}
        }
    }
    assert!(saw_error);
    assert!(saw_literal);
}
3078
/// A run of malformed start tags reports a missing attribute value and missing
/// whitespace between attributes, and the trailing `<a/>` is flagged self-closing.
#[test]
fn malformed_attribute_syntax_triggers_simple_parser_errors() {
    let markup = "<a href=></a><a title=\"x\"href=\"y\"></a><a /x></a><a ></a><a/>";
    let events = collect(SimpleHtmlEventSource::from_str("t", InputFormat::Html, markup));

    for expected_code in [
        "html.tokenizer.attr_value_missing",
        "html.tokenizer.no_space_between_attrs",
    ] {
        let reported = events.iter().any(|event| {
            matches!(event, ParseEvent::ParseError { code, .. } if code == expected_code)
        });
        assert!(reported);
    }

    let has_self_closing_a = events.iter().any(|event| match event {
        ParseEvent::StartTag {
            name, self_closing, ..
        } => name == "a" && *self_closing,
        _ => false,
    });
    assert!(has_self_closing_a);
}
3100
/// Calls the character-reference decoders directly with edge-case inputs:
/// text that only looks like a reference is left untouched, valid numeric
/// references are decoded, `&#0;` becomes U+FFFD, and in attribute context a
/// semicolon-less named reference followed by `=` is left as written.
///
/// The numeric and `&copy` inputs must be spelled as references; with the
/// already-decoded characters ("A", U+FFFD, "©") each assertion would pass as
/// a plain identity check without any decoding being exercised.
#[test]
fn decode_char_refs_covers_edge_cases_directly() {
    // Not references at all: returned unchanged.
    assert_eq!(
        decode_char_refs(InputFormat::Html, "&".to_string(), false),
        "&"
    );
    assert_eq!(
        decode_char_refs(InputFormat::Html, "&#;".to_string(), false),
        "&#;"
    );
    assert_eq!(
        decode_char_refs(InputFormat::Html, "&bogus;".to_string(), false),
        "&bogus;"
    );

    // Decimal, lowercase-hex and uppercase-hex spellings of U+0041.
    assert_eq!(
        decode_char_refs(InputFormat::Html, "&#65;".to_string(), false),
        "A"
    );
    assert_eq!(
        decode_char_refs(InputFormat::Html, "&#x41;".to_string(), false),
        "A"
    );
    assert_eq!(
        decode_char_refs(InputFormat::Html, "&#X41;".to_string(), false),
        "A"
    );

    // Hex prefix without digits is not a reference.
    assert_eq!(
        decode_char_refs(InputFormat::Html, "&#x;".to_string(), false),
        "&#x;"
    );

    // Code point zero is replaced by U+FFFD.
    assert_eq!(
        decode_char_refs(InputFormat::Html, "&#0;".to_string(), false),
        "\u{FFFD}"
    );

    // Attribute context: `&copy` followed by `=` stays undecoded.
    assert_eq!(
        decode_char_refs(InputFormat::Html, "&copy=1".to_string(), true),
        "&copy=1"
    );

    // NOTE(review): this XHTML case only checks that text without any reference
    // passes through with no errors. It may originally have used a reference
    // such as `&copy`; confirm the intended input and expected output before
    // tightening it.
    let (s, errs) = decode_char_refs_with_errors(InputFormat::Xhtml, "©", true, 0, 1, 1);
    assert_eq!(s, "©");
    assert!(errs.is_empty());

    // Same attribute rule through the error-reporting variant: no decoding, no errors.
    let (s2, errs2) =
        decode_char_refs_with_errors(InputFormat::Html, "&copy=1", true, 0, 1, 1);
    assert_eq!(s2, "&copy=1");
    assert!(errs2.is_empty());
}
3150
/// When the input holds no character references, `decode_char_refs` returns the
/// same `String` buffer it was given (identical pointer and capacity).
#[test]
fn decode_char_refs_returns_input_string_when_no_refs_present() {
    let input = "plain text".to_string();
    let allocation = (input.as_ptr(), input.capacity());

    let decoded = decode_char_refs(InputFormat::Html, input, false);

    assert_eq!(decoded, "plain text");
    assert_eq!((decoded.as_ptr(), decoded.capacity()), allocation);
}
3161
/// HTML lowercases ASCII letters only (`Ü` is left alone); XHTML leaves names untouched.
#[test]
fn normalize_name_lowercases_in_html_but_not_in_xhtml() {
    let html = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
    let html_cases = [("div", "div"), ("DiV", "div"), ("Ü", "Ü"), ("ÜA", "Üa")];
    for (input, expected) in html_cases {
        assert_eq!(html.normalize_name(input), expected);
    }

    let xhtml = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "");
    for unchanged in ["DiV", "ÜA"] {
        assert_eq!(xhtml.normalize_name(unchanged), unchanged);
    }
}
3174
/// The text mode follows the innermost open element, but only when that element
/// is in the HTML namespace: `script` → RawText, `title` → RcData,
/// `plaintext` → Plaintext, while an SVG `script` stays in Data mode.
#[test]
fn current_text_mode_kind_applies_only_in_html_namespace() {
    /// Swaps the innermost open element (and its namespace) for a new one.
    fn replace_top(src: &mut SimpleHtmlEventSource, element: &str, namespace: HtmlNamespace) {
        src.open_elements.pop();
        src.open_namespaces.pop();
        src.open_elements.push(element.to_string());
        src.open_namespaces.push(namespace);
    }

    let mut source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");

    // Nothing open yet.
    assert!(matches!(source.current_text_mode_kind(), TextModeKind::Data));

    source.open_elements.push("script".to_string());
    source.open_namespaces.push(HtmlNamespace::Html);
    assert!(matches!(
        source.current_text_mode_kind(),
        TextModeKind::RawText
    ));

    replace_top(&mut source, "title", HtmlNamespace::Html);
    assert!(matches!(
        source.current_text_mode_kind(),
        TextModeKind::RcData
    ));

    replace_top(&mut source, "plaintext", HtmlNamespace::Html);
    assert!(matches!(
        source.current_text_mode_kind(),
        TextModeKind::Plaintext
    ));

    // Same element name as the RawText case, but in the SVG namespace.
    replace_top(&mut source, "script", HtmlNamespace::Svg);
    assert!(matches!(source.current_text_mode_kind(), TextModeKind::Data));
}
3207
/// When the element and namespace stacks are out of step, the text mode is
/// `Data` and the insertion namespace is the value asserted for each case below.
#[test]
fn internal_stack_mismatches_fall_back_to_defaults() {
    /// Checks the text mode is `Data` and the insertion namespace is `expected`.
    fn assert_data_mode_in(src: &mut SimpleHtmlEventSource, expected: HtmlNamespace) {
        assert!(matches!(src.current_text_mode_kind(), TextModeKind::Data));
        assert_eq!(src.current_insertion_namespace(), expected);
    }

    let mut source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");

    // An element with no namespace entry.
    source.open_elements.push("script".to_string());
    assert_data_mode_in(&mut source, HtmlNamespace::Html);

    // An HTML namespace entry with no element.
    source.open_elements.clear();
    source.open_namespaces.push(HtmlNamespace::Html);
    assert_data_mode_in(&mut source, HtmlNamespace::Html);

    // An SVG namespace entry with no element.
    source.open_namespaces.clear();
    source.open_namespaces.push(HtmlNamespace::Svg);
    assert_data_mode_in(&mut source, HtmlNamespace::Svg);
}
3226
/// The namespace assigned to a new start tag depends on the current insertion
/// namespace, which in turn depends on the innermost open element.
#[test]
fn namespace_for_start_tag_respects_current_insertion_namespace() {
    /// Swaps the innermost open element (and its namespace) for a new one.
    fn replace_top(src: &mut SimpleHtmlEventSource, element: &str, namespace: HtmlNamespace) {
        src.open_elements.pop();
        src.open_namespaces.pop();
        src.open_elements.push(element.to_string());
        src.open_namespaces.push(namespace);
    }

    let mut source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");

    // Empty stacks: `svg` and `math` open their own namespaces, anything else is HTML.
    for (tag, expected) in [
        ("div", HtmlNamespace::Html),
        ("svg", HtmlNamespace::Svg),
        ("math", HtmlNamespace::Math),
    ] {
        assert_eq!(source.namespace_for_start_tag(tag), expected);
    }

    // Inside `<svg>`: every child, even `math`, is placed in the SVG namespace.
    source.open_elements.push("svg".to_string());
    source.open_namespaces.push(HtmlNamespace::Svg);
    assert_eq!(source.current_insertion_namespace(), HtmlNamespace::Svg);
    assert_eq!(source.namespace_for_start_tag("div"), HtmlNamespace::Svg);
    assert_eq!(source.namespace_for_start_tag("math"), HtmlNamespace::Svg);

    // These SVG elements switch insertion back to HTML; `svg` can still be re-entered.
    for tag in ["foreignobject", "desc", "title"] {
        replace_top(&mut source, tag, HtmlNamespace::Svg);
        assert_eq!(source.current_insertion_namespace(), HtmlNamespace::Html);
        assert_eq!(source.namespace_for_start_tag("div"), HtmlNamespace::Html);
        assert_eq!(source.namespace_for_start_tag("svg"), HtmlNamespace::Svg);
    }

    // The same element name recorded in the HTML namespace is simply HTML.
    replace_top(&mut source, "foreignobject", HtmlNamespace::Html);
    assert_eq!(source.current_insertion_namespace(), HtmlNamespace::Html);

    // Inside `<math>`: children are placed in the MathML namespace.
    replace_top(&mut source, "math", HtmlNamespace::Math);
    assert_eq!(source.current_insertion_namespace(), HtmlNamespace::Math);
    assert_eq!(source.namespace_for_start_tag("div"), HtmlNamespace::Math);
}
3269
/// Exercises `find_rawtext_end_tag` on truncated, over-long and exact end tags,
/// and `classify_start_tag_eof` on two unterminated attribute fragments.
#[test]
fn rawtext_and_eof_helpers_cover_edge_branches() {
    // Neither a truncated tag nor a longer tag name counts as a match.
    for haystack in [&b"</s"[..], &b"</scriptx>"[..]] {
        assert_eq!(
            find_rawtext_end_tag(haystack, 0, "script", InputFormat::Html),
            None
        );
    }
    assert_eq!(
        find_rawtext_end_tag(b"</script>", 0, "script", InputFormat::Html),
        Some(0)
    );

    // Only the error code is checked; the message half of the tuple is ignored.
    let in_value = classify_start_tag_eof(b"a='b");
    assert_eq!(in_value.0, "html.tokenizer.eof_in_attr_value");
    let in_name = classify_start_tag_eof(b"a='b' c");
    assert_eq!(in_name.0, "html.tokenizer.eof_in_attr_name");
}
3290
/// An unterminated `<textarea>` must still yield its text, with the `&lt;`
/// character reference decoded to a literal `<`.
#[test]
fn rcdata_without_closing_tag_decodes_entities_and_finishes() {
    // The input has to contain an actual character reference; a raw `<` would
    // leave nothing to decode.
    let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<textarea>1 &lt; 2");
    let evs = collect(src);
    assert!(
        evs.iter()
            .any(|e| matches!(e, ParseEvent::Text { text, .. } if text == "1 < 2"))
    );
}
3300
/// Calls several tokenizer helpers directly to reach branches the
/// event-stream tests do not hit.
#[test]
fn direct_helpers_cover_remaining_xhtml_and_edge_branches() {
    // In XHTML mode the five predefined XML entities decode to their literal
    // characters. The input must be spelled with the entity names.
    assert_eq!(
        decode_char_refs(
            InputFormat::Xhtml,
            "&lt;&gt;&amp;&quot;&apos;".to_string(),
            false
        ),
        "<>&\"'"
    );

    // A lone ampersand passes through unchanged and reports nothing.
    let (s, errs) = decode_char_refs_with_errors(InputFormat::Html, "&", false, 0, 1, 1);
    assert_eq!(s, "&");
    assert!(errs.is_empty());

    // A hex reference without digits is kept verbatim and flagged.
    let (s2, errs2) = decode_char_refs_with_errors(InputFormat::Html, "&#x;", false, 0, 1, 1);
    assert_eq!(s2, "&#x;");
    assert!(errs2.iter().any(|e| matches!(
        e,
        ParseEvent::ParseError { code, .. } if code == "html.tokenizer.charref_no_digits"
    )));

    // Prefix matching: empty needle, differing case, needle longer than input.
    assert!(starts_with_ascii_case_insensitive(b"do", b""));
    assert!(starts_with_ascii_case_insensitive(b"DOCTYPE", b"doctype"));
    assert!(!starts_with_ascii_case_insensitive(b"do", b"doctype"));

    // `href=` immediately followed by `>` reports a missing attribute value.
    let src = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "");
    let (_name, _attrs, _self_closing, errs3) =
        parse_start_tag(&src, "a href=>", 0, 1, 1, 8).unwrap();
    assert!(errs3.iter().any(|e| matches!(
        e,
        ParseEvent::ParseError { code, .. } if code == "html.tokenizer.attr_value_missing"
    )));
}
3335
/// The parse error raised while decoding a text run must appear earlier in
/// the event stream than the text event for that run.
#[test]
fn text_run_decode_errors_are_emitted_before_text_event() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "&#x;");
    let events = collect(source);

    let is_charref_error = |e: &ParseEvent| {
        matches!(
            e,
            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.charref_no_digits"
        )
    };
    let is_literal_text =
        |e: &ParseEvent| matches!(e, ParseEvent::Text { text, .. } if text == "&#x;");

    let error_at = events
        .iter()
        .position(is_charref_error)
        .expect("expected parse error event");
    let text_at = events
        .iter()
        .position(is_literal_text)
        .expect("expected text event");
    assert!(error_at < text_at);
}
3353
/// Checks that the `HtmlEventSource` wrapper forwards `source_name`, `format`
/// and `next_event` for both an HTML and an XHTML input.
#[test]
fn html_event_source_covers_variant_dispatch() {
    // HTML input: which backend is chosen depends on the `html5ever` feature.
    let mut html_source =
        HtmlEventSource::from_str("t", InputFormat::Html, "<p>x</p>").unwrap();
    assert_eq!(html_source.source_name(), "t");
    assert_eq!(html_source.format(), InputFormat::Html);
    #[cfg(feature = "html5ever")]
    assert!(matches!(&html_source, HtmlEventSource::Html5Ever(_)));
    let first_html_event = html_source.next_event().unwrap();
    assert!(first_html_event.is_some());

    // XHTML input is expected to use the simple backend.
    let mut xhtml_source =
        HtmlEventSource::from_str("t2", InputFormat::Xhtml, "<?xml-stylesheet href=\"a\"?>")
            .unwrap();
    assert_eq!(xhtml_source.source_name(), "t2");
    assert_eq!(xhtml_source.format(), InputFormat::Xhtml);
    assert!(matches!(&xhtml_source, HtmlEventSource::Simple(_)));
    let first_xhtml_event = xhtml_source.next_event().unwrap();
    assert!(matches!(
        first_xhtml_event,
        Some(ParseEvent::ProcessingInstruction { .. })
    ));
}
3374
/// An end tag placed right after a newline is expected at line 2, column 1.
#[test]
fn newline_advances_line_and_col_for_end_tags() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        "<p>\n</p>",
    ));

    let end_tag_span = events
        .iter()
        .find_map(|event| {
            if let ParseEvent::EndTag { name, span, .. } = event {
                if name == "p" {
                    return *span;
                }
            }
            None
        })
        .expect("expected </p> span");

    assert_eq!(end_tag_span.line, 2);
    assert_eq!(end_tag_span.col, 1);
}
3387
/// A `<math>` element with a nested `<mi>` must produce start tags for both
/// and an end tag for `math`.
#[test]
fn math_namespace_is_tracked_across_nested_tags() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        "<math><mi>x</mi></math>",
    ));

    let opened = |wanted: &str| {
        events
            .iter()
            .any(|e| matches!(e, ParseEvent::StartTag { name, .. } if name == wanted))
    };
    let closed = |wanted: &str| {
        events
            .iter()
            .any(|e| matches!(e, ParseEvent::EndTag { name, .. } if name == wanted))
    };

    assert!(opened("math"));
    assert!(opened("mi"));
    assert!(closed("math"));
}
3406
/// `<>` and `</>` must each be reported as a parse error and also be emitted
/// as literal text.
#[test]
fn common_bad_sequences_emit_errors_and_text() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        "<></>",
    ));

    let reported = |wanted: &str| {
        events
            .iter()
            .any(|e| matches!(e, ParseEvent::ParseError { code, .. } if code == wanted))
    };
    let has_text = |wanted: &str| {
        events
            .iter()
            .any(|e| matches!(e, ParseEvent::Text { text, .. } if text == wanted))
    };

    assert!(reported("html.tokenizer.lt_gt"));
    assert!(has_text("<>"));
    assert!(reported("html.tokenizer.lt_slash_gt"));
    assert!(has_text("</>"));
}
3431
/// A well-formed comment yields its inner text and a span covering all 11
/// bytes of the input, starting at line 1, column 1.
#[test]
fn comment_parses_text_and_span() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<!-- hi -->");
    let events = collect(source);

    let comment = events
        .iter()
        .find_map(|event| {
            if let ParseEvent::Comment { text, span } = event {
                Some((text.as_str(), *span))
            } else {
                None
            }
        })
        .expect("expected comment event");

    assert_eq!(comment.0, " hi ");
    assert_eq!(comment.1, Some(Span::new(0, 11, 1, 1)));
}
3449
/// Input that ends right after the comment opener must report
/// `eof_in_comment`.
#[test]
fn comment_eof_emits_expected_parse_error() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<!--");
    let events = collect(source);

    let saw_eof_in_comment = events.iter().any(|event| {
        matches!(
            event,
            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.eof_in_comment"
        )
    });
    assert!(saw_eof_in_comment);
}
3462
/// Each malformed doctype below must produce its specific tokenizer error
/// code.
#[test]
fn doctype_emits_additional_vnu_like_syntax_errors() {
    // Tokenizes `markup` as HTML and reports whether `wanted` was emitted.
    let emits = |markup: &str, wanted: &str| {
        collect(SimpleHtmlEventSource::from_str(
            "t",
            InputFormat::Html,
            markup,
        ))
        .iter()
        .any(|e| matches!(e, ParseEvent::ParseError { code, .. } if code == wanted))
    };

    let cases = [
        (
            "<!DOCTYPEhtml>",
            "html.tokenizer.doctype.missing_space_before_name",
        ),
        (
            "<!DOCTYPE html PUBLIC\"a\">",
            "html.tokenizer.doctype.no_space_after_public",
        ),
        (
            "<!DOCTYPE html PUBLIC \"a\"\"b\">",
            "html.tokenizer.doctype.no_space_between_public_system",
        ),
        (
            "<!DOCTYPE html SYSTEM\"a\">",
            "html.tokenizer.doctype.no_space_after_system",
        ),
        ("<!DOCTYPE html bogus>", "html.tokenizer.doctype.bogus"),
    ];

    for (markup, wanted) in cases {
        assert!(emits(markup, wanted), "missing {wanted} for {markup}");
    }
}
3515
/// Each malformed start tag below must produce the listed tokenizer error,
/// including duplicate attributes that differ only in ASCII case.
#[test]
fn parse_start_tag_emits_errors_for_malformed_attribute_syntax() {
    // Tokenizes `markup` as HTML and reports whether `wanted` was emitted.
    let emits = |markup: &str, wanted: &str| {
        collect(SimpleHtmlEventSource::from_str(
            "t",
            InputFormat::Html,
            markup,
        ))
        .iter()
        .any(|e| matches!(e, ParseEvent::ParseError { code, .. } if code == wanted))
    };

    let cases = [
        ("<a =></a>", "html.tokenizer.equals_expecting_attr_name"),
        ("<a <x=1></a>", "html.tokenizer.lt_expecting_attr_name"),
        ("<a x\"y=1></a>", "html.tokenizer.quote_in_attr_name"),
        ("<a x<y=1></a>", "html.tokenizer.lt_in_attr_name"),
        ("<a id='a' id='b'></a>", "html.tokenizer.duplicate_attribute"),
        ("<a ID='a' id='b'></a>", "html.tokenizer.duplicate_attribute"),
    ];

    for (markup, wanted) in cases {
        assert!(emits(markup, wanted), "missing {wanted} for {markup}");
    }
}
3578
/// The same upper-case markup is expected to yield lower-cased tag names in
/// HTML mode and unchanged names in XHTML mode.
#[test]
fn html_normalizes_tag_names_to_ascii_lowercase_but_xhtml_preserves_case() {
    for (format, expected) in [(InputFormat::Html, "div"), (InputFormat::Xhtml, "DIV")] {
        let events = collect(SimpleHtmlEventSource::from_str("t", format, "<DIV></DIV>"));

        // The first event is the start tag, the second the end tag.
        assert!(matches!(
            &events[0],
            ParseEvent::StartTag { name, .. } if name == expected
        ));
        assert!(matches!(
            &events[1],
            ParseEvent::EndTag { name, .. } if name == expected
        ));
    }
}
3597
/// Characters that are errors at the start of, or inside, an unquoted
/// attribute value must each produce their dedicated error code.
#[test]
fn unquoted_attribute_values_emit_expected_parse_errors() {
    // Tokenizes `markup` as HTML and reports whether `wanted` was emitted.
    let emits = |markup: &str, wanted: &str| {
        collect(SimpleHtmlEventSource::from_str(
            "t",
            InputFormat::Html,
            markup,
        ))
        .iter()
        .any(|e| matches!(e, ParseEvent::ParseError { code, .. } if code == wanted))
    };

    let cases = [
        ("<a x=`y ></a>", "html.tokenizer.backtick_at_start_unquoted"),
        ("<a x=y`z ></a>", "html.tokenizer.backtick_in_unquoted"),
        ("<a x=<y ></a>", "html.tokenizer.lt_at_start_unquoted"),
        ("<a x=y<z ></a>", "html.tokenizer.lt_in_unquoted"),
        ("<a x==y ></a>", "html.tokenizer.equals_at_start_unquoted"),
        ("<a x=y\"z ></a>", "html.tokenizer.quote_in_unquoted"),
    ];

    for (markup, wanted) in cases {
        assert!(emits(markup, wanted), "missing {wanted} for {markup}");
    }
}
3660
/// A `/` inside a start tag that is separated from `>` by whitespace must be
/// reported.
#[test]
fn slash_not_immediately_followed_by_gt_emits_error() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<a / ></a>");
    let events = collect(source);

    let flagged = events.iter().any(|event| {
        matches!(
            event,
            ParseEvent::ParseError { code, .. }
                if code == "html.tokenizer.slash_not_immediately_followed_by_gt"
        )
    });
    assert!(flagged);
}
3673
/// A proper `/>` terminator must not trigger the stray-slash error.
#[test]
fn slash_immediately_followed_by_gt_does_not_emit_error() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<a /></a>");
    let events = collect(source);

    // No event in the stream may carry the stray-slash code.
    let never_flagged = events.iter().all(|event| {
        !matches!(
            event,
            ParseEvent::ParseError { code, .. }
                if code == "html.tokenizer.slash_not_immediately_followed_by_gt"
        )
    });
    assert!(never_flagged);
}
3686
/// An `<image>` start tag must be reported with `image_start_tag`.
#[test]
fn image_start_tag_emits_error() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<image></image>");
    let events = collect(source);

    let flagged = events.iter().any(|event| {
        matches!(
            event,
            ParseEvent::ParseError { code, .. } if code == "html.tokenizer.image_start_tag"
        )
    });
    assert!(flagged);
}
3699
/// In XHTML mode only an end tag with the exact same casing closes the raw
/// text element.
#[test]
fn rawtext_end_tag_search_is_case_sensitive_in_xhtml() {
    let same_case = find_rawtext_end_tag(b"</script>", 0, "script", InputFormat::Xhtml);
    assert_eq!(same_case, Some(0));

    let upper_case = find_rawtext_end_tag(b"</SCRIPT>", 0, "script", InputFormat::Xhtml);
    assert_eq!(upper_case, None);
}
3711
/// Upper-case XHTML tags must come through with their original casing in
/// both the start-tag and the end-tag event.
#[test]
fn xhtml_end_tag_matching_is_case_sensitive() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Xhtml,
        "<A></A>",
    ));

    let has_start = events
        .iter()
        .any(|e| matches!(e, ParseEvent::StartTag { name, .. } if name == "A"));
    assert!(has_start);

    let has_end = events
        .iter()
        .any(|e| matches!(e, ParseEvent::EndTag { name, .. } if name == "A"));
    assert!(has_end);
}
3725
/// Every problematic numeric character reference must be reported with its
/// dedicated error code.
///
/// The inputs are written as references (`&#…;`), never as the characters
/// they expand to. A pre-decoded character contains no reference, so the
/// decoder could not raise any `charref_*` error for it.
#[test]
fn numeric_character_references_emit_expected_parse_errors() {
    // NOTE(review): the code points are the usual representative of each
    // class; confirm against the decoder's range checks.
    let cases = [
        ("&#0;", "html.tokenizer.charref_zero"),
        ("&#x110000;", "html.tokenizer.charref_outside_range"),
        ("&#xD800;", "html.tokenizer.charref_surrogate"),
        ("&#x0D;", "html.tokenizer.charref_cr"),
        ("&#x80;", "html.tokenizer.charref_c1_controls"),
        ("&#x1FFFE;", "html.tokenizer.charref_astral_noncharacter"),
        ("&#xFFFE;", "html.tokenizer.charref_noncharacter"),
        ("&#xFDD0;", "html.tokenizer.charref_unassigned"),
        ("&#x01;", "html.tokenizer.charref_control"),
    ];

    for (input, expected_code) in cases {
        let (_s, errs) = decode_char_refs_with_errors(InputFormat::Html, input, false, 0, 1, 1);
        assert!(
            errs.iter().any(|e| matches!(
                e,
                ParseEvent::ParseError { code, .. } if code == expected_code
            )),
            "missing {expected_code} for {input}"
        );
    }
}
3751
/// An unknown named reference is left untouched in the output and produces
/// no errors.
#[test]
fn decode_char_refs_with_errors_reports_unrecognized_named_refs_as_literal_ampersand() {
    let unknown_ref = "&zzzzzz;";
    let (decoded, errors) =
        decode_char_refs_with_errors(InputFormat::Html, unknown_ref, false, 0, 1, 1);

    assert_eq!(decoded, "&zzzzzz;");
    assert!(errors.is_empty());
}
3758
/// A forbidden code point (U+000B) in the input must still be reported when
/// the same text also contains a character reference that gets decoded.
#[test]
fn decode_char_refs_with_errors_preserves_stream_errors_when_decoding_refs() {
    // `&amp;` makes the decoder actually expand a reference. A bare `&` would
    // leave the "when decoding refs" half of this test unexercised.
    let input = "a\u{000B}&amp;";
    let (s, errs) = decode_char_refs_with_errors(InputFormat::Html, input, false, 0, 1, 1);
    assert_eq!(s, "a\u{000B}&");
    assert!(errs.iter().any(|e| matches!(
        e,
        ParseEvent::ParseError { code, .. } if code == "html.tokenizer.forbidden_code_point"
    )));
}
3769
/// Malformed end tags (stray slash, attributes, missing `>`) must each
/// produce their specific error code.
#[test]
fn end_tag_syntax_errors_are_reported() {
    // Tokenizes `markup` as HTML and reports whether `wanted` was emitted.
    let emits = |markup: &str, wanted: &str| {
        collect(SimpleHtmlEventSource::from_str(
            "t",
            InputFormat::Html,
            markup,
        ))
        .iter()
        .any(|e| matches!(e, ParseEvent::ParseError { code, .. } if code == wanted))
    };

    let cases = [
        ("</p/>", "html.tokenizer.end_tag_stray_slash"),
        ("</p class=x>", "html.tokenizer.end_tag_with_attrs"),
        ("</p", "html.tokenizer.eof_in_end_tag"),
    ];

    for (markup, wanted) in cases {
        assert!(emits(markup, wanted), "missing {wanted} for {markup}");
    }
}
3802
/// An invalid UTF-8 byte inside an end tag name is expected to surface as
/// U+FFFD in the emitted end-tag name.
#[test]
fn end_tag_with_invalid_utf8_is_lossy_and_emits_end_tag() {
    // `\xFF` can never appear in well-formed UTF-8.
    let raw = b"<p></p\xFF>".to_vec();
    let events = collect(SimpleHtmlEventSource::from_bytes(
        "t",
        InputFormat::Html,
        raw,
    ));

    let saw_lossy_end_tag = events
        .iter()
        .any(|e| matches!(e, ParseEvent::EndTag { name, .. } if name == "p\u{FFFD}"));
    assert!(saw_lossy_end_tag);
}
3819
/// Text containing an invalid UTF-8 byte is converted lossily (the byte
/// becomes U+FFFD) and character references in the same run are still
/// decoded. Checked once inside a `<p>` element and once as bare top-level
/// text.
#[test]
fn text_with_invalid_utf8_is_lossy_and_decodes_char_refs() {
    // `\xFF` is never valid UTF-8. `&amp;` is an actual reference, so the
    // decoding half of the test is exercised.
    let inputs: [&[u8]; 2] = [b"<p>\xFF&amp;</p>", b"\xFF&amp;"];

    for input in inputs {
        let evs = collect(SimpleHtmlEventSource::from_bytes(
            "t",
            InputFormat::Html,
            input.to_vec(),
        ));
        let texts: Vec<_> = evs
            .iter()
            .filter_map(|e| match e {
                ParseEvent::Text { text, .. } => Some(text.clone()),
                _ => None,
            })
            .collect();
        // Exactly one text event: replacement character followed by the
        // decoded ampersand.
        assert_eq!(texts, vec!["\u{FFFD}&".to_string()]);
    }
}
3856
/// A start tag cut off before its closing `>` must produce at least one
/// parse error; the specific code is not checked.
#[test]
fn start_tag_eof_is_reported_when_tag_close_is_missing() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "<a href='x'");
    let events = collect(source);

    let any_parse_error = events
        .iter()
        .any(|event| matches!(event, ParseEvent::ParseError { .. }));
    assert!(any_parse_error);
}
3869
/// Covers two doctype cases: EOF inside a public identifier, and the HTML
/// 4.01 Transitional doctype with its exact "almost standards" message.
#[test]
fn doctype_identifier_edge_cases_cover_public_eof_and_almost_standards_message() {
    // Input ends while the public identifier is still open.
    let truncated = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        "<!DOCTYPE html PUBLIC \"a",
    ));
    let saw_eof_in_public_id = truncated.iter().any(|event| {
        matches!(
            event,
            ParseEvent::ParseError { code, .. }
                if code == "html.tokenizer.doctype.eof_in_public_id"
        )
    });
    assert!(saw_eof_in_public_id);

    // Both the code and the exact message text are checked here.
    let transitional = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",
    ));
    let saw_almost_standards = transitional.iter().any(|event| {
        matches!(
            event,
            ParseEvent::ParseError { code, message, .. }
                if code == "html.parser.doctype.not_html5"
                    && message == "Almost standards mode doctype. Expected “<!DOCTYPE html>”."
        )
    });
    assert!(saw_almost_standards);
}
3894
/// An upper-case `HTML` doctype name in XHTML input must not be flagged as
/// non-HTML5, and the emitted doctype keeps the original casing.
#[test]
fn doctype_name_is_compared_case_insensitively_for_html5_conformance() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Xhtml, "<!DOCTYPE HTML>");
    let events = collect(source);

    let flagged_not_html5 = events.iter().any(|event| {
        matches!(
            event,
            ParseEvent::ParseError { code, .. } if code == "html.parser.doctype.not_html5"
        )
    });
    assert!(!flagged_not_html5);

    let kept_original_case = events.iter().any(|event| {
        matches!(
            event,
            ParseEvent::Doctype { name: Some(name), .. } if name == "HTML"
        )
    });
    assert!(kept_original_case);
}
3911
/// A `>` inside a quoted attribute value must not be taken as the end of the
/// tag.
#[test]
fn find_tag_close_ignores_gt_inside_quotes() {
    // Single-quoted value containing `>`: the real close is the last byte.
    let single_quoted = b"<a x='>' y=z>";
    let last = single_quoted.len() - 1;
    assert_eq!(find_tag_close(single_quoted, 1), Some(last));

    // Double-quoted value containing `>`.
    let double_quoted = b"<a x=\"a>b\" y=z>";
    let last = double_quoted.len() - 1;
    assert_eq!(find_tag_close(double_quoted, 1), Some(last));

    // The only `>` is quoted, so there is no close at all.
    let unterminated = b"<a x='>'";
    assert_eq!(find_tag_close(unterminated, 1), None);
}
3923
/// Three `<a>` tags with differently spaced attributes must each be
/// recognised: a lone `x`, a bare `x` followed by `y=z`, and `x= y`.
#[test]
fn find_tag_close_state_machine_covers_additional_transitions() {
    let events = collect(SimpleHtmlEventSource::from_str(
        "t",
        InputFormat::Html,
        "<a x ></a><a x y=z></a><a x= y></a>",
    ));

    let (mut lone_x_seen, mut bare_x_then_y_seen, mut x_equals_y_seen) = (false, false, false);

    for event in &events {
        let Some((tag, attributes)) = as_start_tag(event) else {
            continue;
        };
        // Every shape below is only recognised on an `a` start tag.
        if tag != "a" {
            continue;
        }

        let is_lone_x = attributes.iter().any(|a| a.name == "x") && attributes.len() == 1;
        let is_bare_x_then_y = attributes
            .iter()
            .any(|a| a.name == "x" && a.value.is_none())
            && attributes
                .iter()
                .any(|a| a.name == "y" && a.value.as_deref() == Some("z"));
        let is_x_equals_y = attributes
            .iter()
            .any(|a| a.name == "x" && a.value.as_deref() == Some("y"));

        // The first shape is claimed only once; later tags fall through to
        // the remaining checks, in this order.
        if is_lone_x && !lone_x_seen {
            lone_x_seen = true;
        } else if is_bare_x_then_y {
            bare_x_then_y_seen = true;
        } else if is_x_equals_y {
            x_equals_y_seen = true;
        }
    }

    assert!(lone_x_seen);
    assert!(bare_x_then_y_seen);
    assert!(x_equals_y_seen);
}
3963
/// Input consisting of just `</` must yield exactly one event: the
/// `eof_after_lt` parse error spanning both bytes.
#[test]
fn end_tag_prefix_at_eof_emits_eof_after_lt() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "</");
    let events = collect(source);
    assert_eq!(events.len(), 1);

    match &events[0] {
        ParseEvent::ParseError {
            code,
            message,
            span,
        } => {
            assert_eq!(code, "html.tokenizer.eof_after_lt");
            assert_eq!(message, "End of file after “<”.");
            assert_eq!(span.unwrap(), Span::new(0, 2, 1, 1));
        }
        _ => panic!("expected a parse error event"),
    }
}
3984
/// `</` followed by a non-name character must yield a parse error and then a
/// comment event holding the remaining text.
#[test]
fn end_tag_garbage_after_lt_slash_emits_error_and_comment() {
    let source = SimpleHtmlEventSource::from_str("t", InputFormat::Html, "</ x>");
    let events = collect(source);
    assert_eq!(events.len(), 2);

    let error_first = matches!(
        &events[0],
        ParseEvent::ParseError { code, .. } if code == "html.tokenizer.garbage_after_lt_slash"
    );
    assert!(error_first);

    let comment_second = matches!(&events[1], ParseEvent::Comment { text, .. } if text == " x");
    assert!(comment_second);
}
3999}