#[cfg(test)]
mod tests;
mod character_references;
use crate::input_stream::InputStream;
use annotate_snippets::{AnnotationKind, Level, Snippet};
use character_references::NAMED_CHARACTER_REFERENCES;
use std::char;
use std::collections::VecDeque;
use std::fmt::Display;
use std::ops::Range;
use std::sync::Arc;
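// Unicode noncharacter code points outside the contiguous U+FDD0..=U+FDEF block.
// `state_numeric_character_reference_end` consults this list when reporting
// noncharacter-character-reference errors.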
const NON_CHARACTERS: [u32; 34] = [
0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF,
0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF,
0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF,
0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF,
];
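// Every tokenizer state of the WHATWG HTML tokenization algorithm is a free
// function with this signature; `Tokenizer::run` calls the current state in a
// loop until at least one token has been buffered.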
type StateFn = fn(&mut Tokenizer);
#[derive(Debug, Clone)]
pub(crate) struct Token {
pub kind: TokenKind,
pub span: Range<usize>,
}
#[derive(Debug, Clone)]
pub(crate) enum TokenKind {
Eof,
Text(String),
Tag(Tag),
Comment(String),
Doctype(Doctype),
}
#[derive(Debug, Clone)]
pub(crate) struct Tag {
pub name: String,
pub kind: TagKind,
pub attributes: Vec<Attribute>,
pub self_closing: bool,
}
#[derive(Debug, Clone)]
pub(crate) struct Attribute {
pub name: String,
pub value: String,
}
#[derive(Debug, Clone)]
pub(crate) enum TagKind {
Start,
End,
}
#[derive(Debug, Clone)]
pub(crate) struct Doctype {
pub name: Option<String>,
pub force_quirks: bool,
pub public_identifier: Option<String>,
pub system_identifier: Option<String>,
}
#[derive(Debug)]
pub(crate) struct TokenizerError {
pub code: ErrorCode,
pub offset: usize,
pub source: Arc<String>,
}
#[derive(Debug)]
pub(crate) enum ErrorCode {
UnexpectedNullCharacter,
UnexpectedQuestionMarkInsteadOfTagName,
UnexpectedEqualsSignBeforeAttributeName,
UnexpectedCharacterInAttributeName,
UnexpectedCharacterInUnquotedAttributeValue,
UnexpectedSolidusInTag,
MissingSemiColonAfterCharacterReference,
MissingEndTagName,
MissingAttributeValue,
MissingWhitespaceBetweenAttributes,
UnknownNamedCharacterReference,
AbsenceOfDigitsInNumericCharacterReference,
AbruptClosingOfEmptyComment,
NullCharacterReference,
CharacterReferenceOutsideUnicodeRange,
SurrogateCharacterRefference,
NonCharacterReference,
ControlCharacterReference,
EofBeforeTagName,
EofInTag,
InvalidFirstCharacterOfTagName,
IncorrectlyOpenedComment,
EofInComment,
NestedComment,
IncorrectlyClosedComment,
EofInDoctype,
MissingWhitespaceBeforeDoctypeName,
MissingDoctypeName,
InvalidCharacterSequenceAfterDoctypeName,
MissingWhitespaceAfterDoctypePublicKeyword,
MissingDoctypePublicIdentifier,
MissingQuoteBeforeDoctypePublicIdentifier,
AbruptDoctypePublicIdentifier,
MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
MissingQuoteBeforeDoctypeSystemIdentifier,
MissingWhitespaceAfterDoctypeSystemKeyword,
MissingDoctypeSystemIdentifier,
AbruptDoctypeSystemIdentifier,
UnexpectedCharacterAfterDoctypeSystemIdentifier,
EofInCdata,
EofInScriptHtmlCommentLikeText,
DuplicateAttribute,
EndTagWithTrailingSolidus,
EndTagWithAttributes,
}
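// The tokenizer proper. `state`/`return_state` drive the spec state machine,
// `temporary_buffer` and `character_reference_code` back the character-reference
// states, `char_buffer` coalesces adjacent character tokens into a single Text
// token, `token_buffer` holds tokens ready to be handed out by `next_token`, and
// `consumed_as_part_of_attribute` records whether the current character
// reference was started from an attribute-value state.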
pub(crate) struct Tokenizer {
input_stream: InputStream,
state: StateFn,
return_state: Option<StateFn>,
pub errors: Vec<TokenizerError>,
temporary_buffer: String,
character_reference_code: u32,
current_token: Option<TokenKind>,
last_emitted_start_tag_name: Option<String>,
current_token_start: usize,
token_buffer: VecDeque<Token>,
char_buffer: String,
consumed_as_part_of_attribute: bool,
}
impl Tokenizer {
pub(crate) fn new(html: &str) -> Tokenizer {
let input_stream = InputStream::new(html);
Tokenizer {
input_stream,
state: state_data,
return_state: None,
errors: Vec::new(),
temporary_buffer: String::new(),
character_reference_code: 0,
current_token: None,
last_emitted_start_tag_name: None,
current_token_start: 0,
token_buffer: VecDeque::new(),
char_buffer: String::new(),
consumed_as_part_of_attribute: false,
}
}
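    // Pumps the state machine until at least one token is buffered, then returns
    // the oldest buffered token. Character data coalesced in `char_buffer` is
    // flushed as a Text token ahead of whatever token forced the flush (see
    // `emit_token`).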
pub(crate) fn next_token(&mut self) -> Token {
if let Some(token) = self.token_buffer.pop_front() {
return token;
}
self.run();
self.next_token()
}
#[cfg(test)]
fn set_last_emitted_start_tag_name(&mut self, name: Option<String>) {
self.last_emitted_start_tag_name = name;
}
pub(crate) fn get_input(&self) -> Arc<String> {
self.input_stream.get_input()
}
pub(crate) fn switch_to(&mut self, state: StateFn) {
self.state = state
}
fn consume_match(&mut self, expected: &str) -> bool {
if self.input_stream.lookahead(expected.len()) != expected {
return false;
}
self.input_stream.advance(expected.len());
true
}
fn consume_case_insensitive_match(&mut self, expected: &str) -> bool {
let lookahead = self.input_stream.lookahead(expected.len());
if !lookahead.eq_ignore_ascii_case(expected) {
return false;
}
self.input_stream.advance(expected.len());
true
}
fn reconsume_in(&mut self, state: StateFn) {
self.input_stream.reconsume();
self.state = state;
}
fn reconsume_in_return_state(&mut self) {
let return_state = self.take_return_state();
self.reconsume_in(return_state);
}
fn take_return_state(&mut self) -> StateFn {
debug_assert!(
self.return_state.is_some(),
"return state must be set in {:?}",
self.state
);
self.consumed_as_part_of_attribute = false;
self.return_state.take().unwrap()
}
fn emit_error(&mut self, label: ErrorCode) {
self.errors.push(TokenizerError {
code: label,
offset: self.input_stream.byte_offset(),
source: self.input_stream.get_input(),
})
}
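    // Pushes `kind` into the token buffer, first flushing any pending character
    // data as a single Text token and keeping `current_token_start` aligned with
    // the input stream's byte offset.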
fn emit_token(&mut self, kind: TokenKind) {
        if !self.char_buffer.is_empty() {
            // Take the buffered text and remember its length before the buffer is
            // emptied; computing the span from `char_buffer.len()` after moving
            // the text out would always produce an empty range.
            let text = std::mem::take(&mut self.char_buffer);
            let text_len = text.len();
            self.token_buffer.push_back(Token {
                kind: TokenKind::Text(text),
                span: self.current_token_start..self.current_token_start + text_len,
            });
            self.current_token_start += text_len;
        }
self.token_buffer.push_back(Token {
kind,
span: self.current_token_start..self.input_stream.byte_offset(),
});
self.current_token_start = self.input_stream.byte_offset();
}
fn emit_char_token(&mut self, c: char) {
self.char_buffer.push(c);
}
fn emit_eof(&mut self) {
self.emit_token(TokenKind::Eof);
}
fn emit_current_tag_token(&mut self) {
let Some(TokenKind::Tag(mut tag)) = self.current_token.take() else {
panic!("can't emit current tag token in this state");
};
if let TagKind::Start = &tag.kind {
self.last_emitted_start_tag_name = Some(tag.name.clone());
}
if let TagKind::End = &tag.kind {
if tag.self_closing {
self.emit_error(ErrorCode::EndTagWithTrailingSolidus);
}
if !tag.attributes.is_empty() {
self.emit_error(ErrorCode::EndTagWithAttributes);
}
}
let mut unique_attributes = Vec::new();
for attr in tag.attributes {
if unique_attributes
.iter()
.any(|a: &Attribute| a.name == attr.name)
{
self.emit_error(ErrorCode::DuplicateAttribute);
continue;
}
unique_attributes.push(attr);
}
tag.attributes = unique_attributes;
self.emit_token(TokenKind::Tag(tag));
}
fn emit_current_comment_token(&mut self) {
let Some(TokenKind::Comment(c)) = self.current_token.take() else {
panic!("can't emit current comment token in this state");
};
self.emit_token(TokenKind::Comment(c));
}
fn emit_current_doctype_token(&mut self) {
let Some(TokenKind::Doctype(d)) = self.current_token.take() else {
panic!("can't emit current doctype token in this state");
};
self.emit_token(TokenKind::Doctype(d));
}
fn switch_to_return_state(&mut self) {
let return_state = self.take_return_state();
self.switch_to(return_state);
}
fn set_return_state(&mut self, state: StateFn) {
self.return_state = Some(state);
}
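    // The spec's "flush code points consumed as a character reference": the
    // temporary buffer is appended to the current attribute's value when the
    // reference was started inside an attribute value, and emitted as character
    // tokens otherwise.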
fn flush_codepoints_consumed_as_character_reference(&mut self) {
let tmp = std::mem::take(&mut self.temporary_buffer);
let iter = tmp.chars();
if self.consumed_as_part_of_attribute {
self.current_attribute().value.extend(iter);
} else {
for codepoint in iter {
self.emit_char_token(codepoint);
}
}
}
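    // An end tag token is "appropriate" when its name matches the last start tag
    // emitted. The RCDATA, RAWTEXT and script-data end-tag-name states use this
    // to decide whether `</...>` actually closes the element or is plain text.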
fn current_end_tag_token_is_appropriate(&self) -> bool {
let Some(TokenKind::Tag(Tag {
name,
kind: TagKind::End,
..
})) = &self.current_token
else {
panic!("current token is not an end tag token");
};
Some(name) == self.last_emitted_start_tag_name.as_ref()
}
fn current_tag_token(&mut self) -> &mut Tag {
let Some(TokenKind::Tag(tag)) = self.current_token.as_mut() else {
panic!("can't get current tag token in current state");
};
tag
}
fn current_comment_token(&mut self) -> &mut String {
let Some(TokenKind::Comment(comment)) = self.current_token.as_mut() else {
panic!("can't get current comment token in current state");
};
comment
}
fn current_doctype_token(&mut self) -> &mut Doctype {
let Some(TokenKind::Doctype(doctype)) = self.current_token.as_mut() else {
panic!("can't get current doctype token in current state");
};
doctype
}
fn current_doctype_token_name(&mut self) -> &mut String {
let doctype = self.current_doctype_token();
let Some(name) = doctype.name.as_mut() else {
panic!("can't get current doctype token's name in this state");
};
name
}
fn current_doctype_token_public_identifier(&mut self) -> &mut String {
let doctype = self.current_doctype_token();
let Some(public_id) = doctype.public_identifier.as_mut() else {
panic!("can't get current doctype token's public identifier in this state");
};
public_id
}
    fn current_doctype_token_system_identifier(&mut self) -> &mut String {
        let doctype = self.current_doctype_token();
        let Some(system_id) = doctype.system_identifier.as_mut() else {
            panic!("can't get current doctype token's system identifier in this state");
        };
        system_id
    }
fn create_new_tag_token(&mut self, kind: TagKind) {
self.current_token = Some(TokenKind::Tag(Tag {
attributes: Vec::new(),
kind,
name: String::new(),
self_closing: false,
}))
}
fn start_new_attribute(&mut self) {
let tag = self.current_tag_token();
tag.attributes.push(Attribute {
name: String::new(),
value: String::new(),
});
}
fn current_tag_token_name(&mut self) -> &mut String {
&mut self.current_tag_token().name
}
fn current_attribute(&mut self) -> &mut Attribute {
let tag = self.current_tag_token();
debug_assert!(
!tag.attributes.is_empty(),
"current tag token has no attributes"
);
tag.attributes.last_mut().unwrap()
}
fn create_new_comment_token(&mut self) {
self.current_token = Some(TokenKind::Comment(String::new()));
}
fn create_new_doctype_token(&mut self) {
self.current_token = Some(TokenKind::Doctype(Doctype {
name: None,
force_quirks: false,
public_identifier: None,
system_identifier: None,
}))
}
fn run(&mut self) {
while self.token_buffer.is_empty() {
(self.state)(self)
}
}
}
fn state_data(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('&') => {
t.set_return_state(state_data);
t.switch_to(state_character_reference);
}
Some('<') => {
t.switch_to(state_tag_open);
}
Some(c @ '\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.emit_char_token(c);
}
None => {
t.emit_eof();
}
Some(c) => {
t.emit_char_token(c);
}
}
}
pub(crate) fn state_rc_data(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('&') => {
t.set_return_state(state_rc_data);
t.switch_to(state_character_reference);
}
Some('<') => {
t.switch_to(state_rc_data_less_than_sign);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.emit_char_token(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_eof();
}
Some(c) => {
t.emit_char_token(c);
}
}
}
pub(crate) fn state_raw_text(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('<') => {
t.switch_to(state_raw_text_less_than_sign);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.emit_char_token(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_eof();
}
Some(c) => {
t.emit_char_token(c);
}
}
}
pub(crate) fn state_script_data(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('<') => {
t.switch_to(state_script_data_less_than_sign);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.emit_char_token(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_eof();
}
Some(c) => {
t.emit_char_token(c);
}
}
}
#[allow(dead_code)]
fn state_plaintext(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.emit_char_token(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_eof();
}
Some(c) => {
t.emit_char_token(c);
}
}
}
fn state_tag_open(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('!') => {
t.switch_to(state_markup_declaration_open);
}
Some('/') => {
t.switch_to(state_end_tag_open);
}
Some(a) if a.is_ascii_alphabetic() => {
t.create_new_tag_token(TagKind::Start);
t.reconsume_in(state_tag_name);
}
Some('?') => {
t.emit_error(ErrorCode::UnexpectedQuestionMarkInsteadOfTagName);
t.create_new_comment_token();
t.reconsume_in(state_bogus_comment);
}
None => {
t.emit_error(ErrorCode::EofBeforeTagName);
t.emit_char_token('<');
t.emit_eof();
}
_ => {
t.emit_error(ErrorCode::InvalidFirstCharacterOfTagName);
t.emit_char_token('<');
t.reconsume_in(state_data);
}
}
}
fn state_end_tag_open(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(a) if a.is_ascii_alphabetic() => {
t.create_new_tag_token(TagKind::End);
t.reconsume_in(state_tag_name);
}
Some('>') => {
t.emit_error(ErrorCode::MissingEndTagName);
t.switch_to(state_data);
}
None => {
t.emit_error(ErrorCode::EofBeforeTagName);
t.emit_char_token('<');
t.emit_char_token('/');
t.emit_eof();
}
_ => {
t.emit_error(ErrorCode::InvalidFirstCharacterOfTagName);
t.create_new_comment_token();
t.reconsume_in(state_bogus_comment);
}
}
}
fn state_tag_name(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
t.switch_to(state_before_attribute_name);
}
Some('/') => {
t.switch_to(state_self_closing_start_tag);
}
Some('>') => {
t.switch_to(state_data);
t.emit_current_tag_token();
}
Some(a) if a.is_ascii_uppercase() => {
t.current_tag_token_name().push(a.to_ascii_lowercase())
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_tag_token_name().push(char::REPLACEMENT_CHARACTER)
}
None => {
t.emit_error(ErrorCode::EofInTag);
t.emit_eof();
}
Some(c) => {
t.current_tag_token().name.push(c);
}
}
}
fn state_rc_data_less_than_sign(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('/') => {
t.temporary_buffer.clear();
t.switch_to(state_rc_data_end_tag_open);
}
_ => {
t.emit_char_token('<');
t.reconsume_in(state_rc_data);
}
}
}
fn state_rc_data_end_tag_open(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(a) if a.is_ascii_alphabetic() => {
t.create_new_tag_token(TagKind::End);
t.reconsume_in(state_rc_data_end_tag_name);
}
_ => {
t.emit_char_token('<');
t.emit_char_token('/');
t.reconsume_in(state_rc_data);
}
}
}
fn state_rc_data_end_tag_name(t: &mut Tokenizer) {
let c = t.input_stream.consume();
if matches!(c, Some('\t') | Some('\n') | Some('\x0c') | Some(' ')) {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_before_attribute_name);
return;
}
} else if c == Some('/') {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_self_closing_start_tag);
return;
}
} else if c == Some('>') {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_data);
t.emit_current_tag_token();
return;
}
} else if c.is_some_and(|c| c.is_ascii_uppercase()) {
let c = c.unwrap();
t.current_tag_token().name.push(c.to_ascii_lowercase());
t.temporary_buffer.push(c);
return;
} else if c.is_some_and(|c| c.is_ascii_lowercase()) {
let c = c.unwrap();
t.current_tag_token().name.push(c);
t.temporary_buffer.push(c);
return;
}
t.emit_char_token('<');
t.emit_char_token('/');
let mut tmp = std::mem::take(&mut t.temporary_buffer);
for c in tmp.drain(..) {
t.emit_char_token(c);
}
t.reconsume_in(state_rc_data);
}
fn state_raw_text_less_than_sign(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('/') => {
t.temporary_buffer.clear();
t.switch_to(state_raw_text_end_tag_open);
}
_ => {
t.emit_char_token('<');
t.reconsume_in(state_raw_text);
}
}
}
fn state_raw_text_end_tag_open(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(a) if a.is_ascii_alphabetic() => {
t.create_new_tag_token(TagKind::End);
t.reconsume_in(state_raw_text_end_tag_name);
}
_ => {
t.emit_char_token('<');
t.emit_char_token('/');
t.reconsume_in(state_raw_text);
}
}
}
fn state_raw_text_end_tag_name(t: &mut Tokenizer) {
let c = t.input_stream.consume();
if matches!(c, Some('\t') | Some('\n') | Some('\x0c') | Some(' ')) {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_before_attribute_name);
return;
}
} else if c == Some('/') {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_self_closing_start_tag);
return;
}
} else if c == Some('>') {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_data);
t.emit_current_tag_token();
return;
}
} else if c.is_some_and(|c| c.is_ascii_uppercase()) {
let c = c.unwrap();
t.current_tag_token().name.push(c.to_ascii_lowercase());
t.temporary_buffer.push(c);
return;
} else if c.is_some_and(|c| c.is_ascii_lowercase()) {
let c = c.unwrap();
t.current_tag_token().name.push(c);
t.temporary_buffer.push(c);
return;
}
t.emit_char_token('<');
t.emit_char_token('/');
let mut tmp = std::mem::take(&mut t.temporary_buffer);
for c in tmp.drain(..) {
t.emit_char_token(c);
}
t.reconsume_in(state_raw_text);
}
fn state_script_data_less_than_sign(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('/') => {
t.temporary_buffer.clear();
t.switch_to(state_script_data_end_tag_open);
}
Some('!') => {
t.switch_to(state_script_data_escape_start);
t.emit_char_token('<');
t.emit_char_token('!');
}
_ => {
t.emit_char_token('<');
t.reconsume_in(state_script_data);
}
}
}
fn state_script_data_end_tag_open(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(a) if a.is_ascii_alphabetic() => {
t.create_new_tag_token(TagKind::End);
t.reconsume_in(state_script_data_end_tag_name);
}
_ => {
t.emit_char_token('<');
t.emit_char_token('/');
t.reconsume_in(state_script_data);
}
}
}
fn state_script_data_end_tag_name(t: &mut Tokenizer) {
let c = t.input_stream.consume();
if matches!(c, Some('\t') | Some('\n') | Some('\x0c') | Some(' ')) {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_before_attribute_name);
return;
}
} else if c == Some('/') {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_self_closing_start_tag);
return;
}
} else if c == Some('>') {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_data);
t.emit_current_tag_token();
return;
}
} else if c.is_some_and(|c| c.is_ascii_uppercase()) {
let c = c.unwrap();
t.current_tag_token().name.push(c.to_ascii_lowercase());
t.temporary_buffer.push(c);
return;
} else if c.is_some_and(|c| c.is_ascii_lowercase()) {
let c = c.unwrap();
t.current_tag_token().name.push(c);
t.temporary_buffer.push(c);
return;
}
t.emit_char_token('<');
t.emit_char_token('/');
let mut tmp = std::mem::take(&mut t.temporary_buffer);
for c in tmp.drain(..) {
t.emit_char_token(c);
}
t.reconsume_in(state_script_data);
}
fn state_script_data_escape_start(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.switch_to(state_script_data_escape_start_dash);
t.emit_char_token('-');
}
_ => {
t.reconsume_in(state_script_data);
}
}
}
fn state_script_data_escape_start_dash(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.switch_to(state_script_data_escaped_dash_dash);
t.emit_char_token('-');
}
_ => {
t.reconsume_in(state_script_data);
}
}
}
fn state_script_data_escaped(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.switch_to(state_script_data_escaped_dash);
t.emit_char_token('-');
}
Some('<') => {
t.switch_to(state_script_data_escaped_less_than_sign);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.emit_char_token(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_error(ErrorCode::EofInScriptHtmlCommentLikeText);
t.emit_eof();
}
Some(c) => {
t.emit_char_token(c);
}
}
}
fn state_script_data_escaped_dash(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.switch_to(state_script_data_escaped_dash_dash);
t.emit_char_token('-');
}
Some('<') => {
t.switch_to(state_script_data_escaped_less_than_sign);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.switch_to(state_script_data_escaped);
t.emit_char_token(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_error(ErrorCode::EofInScriptHtmlCommentLikeText);
t.emit_eof();
}
Some(c) => {
t.switch_to(state_script_data_escaped);
t.emit_char_token(c);
}
}
}
fn state_script_data_escaped_dash_dash(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.emit_char_token('-');
}
Some('<') => {
t.switch_to(state_script_data_escaped_less_than_sign);
}
Some('>') => {
t.switch_to(state_script_data);
t.emit_char_token('>');
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.switch_to(state_script_data_escaped);
t.emit_char_token(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_error(ErrorCode::EofInScriptHtmlCommentLikeText);
t.emit_eof();
}
Some(c) => {
t.switch_to(state_script_data_escaped);
t.emit_char_token(c);
}
}
}
fn state_script_data_escaped_less_than_sign(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('/') => {
t.temporary_buffer.clear();
t.switch_to(state_script_data_escaped_end_tag_open);
}
Some(a) if a.is_ascii_alphabetic() => {
t.temporary_buffer.clear();
t.emit_char_token('<');
t.reconsume_in(state_script_data_double_escape_start);
}
_ => {
t.emit_char_token('<');
t.reconsume_in(state_script_data_escaped);
}
}
}
fn state_script_data_escaped_end_tag_open(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(a) if a.is_ascii_alphabetic() => {
t.create_new_tag_token(TagKind::End);
t.reconsume_in(state_script_data_escaped_end_tag_name);
}
_ => {
t.emit_char_token('<');
t.emit_char_token('/');
t.reconsume_in(state_script_data_escaped);
}
}
}
fn state_script_data_escaped_end_tag_name(t: &mut Tokenizer) {
let c = t.input_stream.consume();
if matches!(c, Some('\t') | Some('\n') | Some('\x0c') | Some(' ')) {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_before_attribute_name);
return;
}
} else if c == Some('/') {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_self_closing_start_tag);
return;
}
} else if c == Some('>') {
if t.current_end_tag_token_is_appropriate() {
t.switch_to(state_data);
t.emit_current_tag_token();
return;
}
} else if c.is_some_and(|c| c.is_ascii_uppercase()) {
let c = c.unwrap();
t.current_tag_token().name.push(c.to_ascii_lowercase());
t.temporary_buffer.push(c);
return;
} else if c.is_some_and(|c| c.is_ascii_lowercase()) {
let c = c.unwrap();
t.current_tag_token().name.push(c);
t.temporary_buffer.push(c);
return;
}
t.emit_char_token('<');
t.emit_char_token('/');
let mut tmp = std::mem::take(&mut t.temporary_buffer);
for c in tmp.drain(..) {
t.emit_char_token(c);
}
t.reconsume_in(state_script_data_escaped);
}
fn state_script_data_double_escape_start(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(c @ '\t') | Some(c @ '\n') | Some(c @ '\x0c') | Some(c @ ' ') | Some(c @ '/')
| Some(c @ '>') => {
if t.temporary_buffer == "script" {
t.switch_to(state_script_data_double_escaped);
} else {
t.switch_to(state_script_data_escaped);
}
t.emit_char_token(c)
}
Some(a) if a.is_ascii_uppercase() => {
t.temporary_buffer.push(a.to_ascii_lowercase());
t.emit_char_token(a);
}
Some(a) if a.is_ascii_lowercase() => {
t.temporary_buffer.push(a);
t.emit_char_token(a);
}
_ => {
t.reconsume_in(state_script_data_escaped);
}
}
}
fn state_script_data_double_escaped(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.switch_to(state_script_data_double_escaped_dash);
t.emit_char_token('-');
}
Some('<') => {
t.switch_to(state_script_data_double_escaped_less_than_sign);
t.emit_char_token('<');
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.emit_char_token(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_error(ErrorCode::EofInScriptHtmlCommentLikeText);
t.emit_eof();
}
Some(c) => {
t.emit_char_token(c);
}
}
}
fn state_script_data_double_escaped_dash(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.switch_to(state_script_data_double_escaped_dash_dash);
t.emit_char_token('-');
}
Some('<') => {
t.switch_to(state_script_data_double_escaped_less_than_sign);
t.emit_char_token('<');
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.switch_to(state_script_data_double_escaped);
t.emit_char_token(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_error(ErrorCode::EofInScriptHtmlCommentLikeText);
t.emit_eof();
}
Some(c) => {
t.switch_to(state_script_data_double_escaped);
t.emit_char_token(c);
}
}
}
fn state_script_data_double_escaped_dash_dash(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.emit_char_token('-');
}
Some('<') => {
t.switch_to(state_script_data_double_escaped_less_than_sign);
t.emit_char_token('<');
}
Some('>') => {
t.switch_to(state_script_data);
t.emit_char_token('>');
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.switch_to(state_script_data_double_escaped);
t.emit_char_token(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_error(ErrorCode::EofInScriptHtmlCommentLikeText);
t.emit_eof();
}
Some(c) => {
t.switch_to(state_script_data_double_escaped);
t.emit_char_token(c);
}
}
}
fn state_script_data_double_escaped_less_than_sign(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('/') => {
t.temporary_buffer.clear();
t.switch_to(state_script_data_double_escape_end);
t.emit_char_token('/');
}
_ => {
t.reconsume_in(state_script_data_double_escaped);
}
}
}
fn state_script_data_double_escape_end(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(c @ '\t') | Some(c @ '\n') | Some(c @ '\x0c') | Some(c @ ' ') | Some(c @ '/')
| Some(c @ '>') => {
if t.temporary_buffer == "script" {
t.switch_to(state_script_data_escaped);
} else {
t.switch_to(state_script_data_double_escaped);
}
t.emit_char_token(c)
}
Some(a) if a.is_ascii_uppercase() => {
t.temporary_buffer.push(a.to_ascii_lowercase());
t.emit_char_token(a);
}
Some(a) if a.is_ascii_lowercase() => {
t.temporary_buffer.push(a);
t.emit_char_token(a);
}
_ => {
t.reconsume_in(state_script_data_double_escaped);
}
}
}
fn state_before_attribute_name(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {}
Some('/') | Some('>') | None => {
t.reconsume_in(state_after_attribute_name);
}
Some('=') => {
t.emit_error(ErrorCode::UnexpectedEqualsSignBeforeAttributeName);
t.start_new_attribute();
t.current_attribute().name.push('=');
t.switch_to(state_attribute_name);
}
_ => {
t.start_new_attribute();
t.reconsume_in(state_attribute_name);
}
}
}
fn state_attribute_name(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') | Some('/') | Some('>') | None => {
t.reconsume_in(state_after_attribute_name)
}
Some('=') => t.switch_to(state_before_attribute_value),
Some(a) if a.is_ascii_uppercase() => {
t.current_attribute().name.push(a.to_ascii_lowercase());
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_attribute().name.push(char::REPLACEMENT_CHARACTER);
}
Some(c @ '"') | Some(c @ '\'') | Some(c @ '<') => {
t.emit_error(ErrorCode::UnexpectedCharacterInAttributeName);
t.current_attribute().name.push(c)
}
Some(c) => t.current_attribute().name.push(c),
}
}
fn state_after_attribute_name(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {}
Some('/') => {
t.switch_to(state_self_closing_start_tag);
}
Some('=') => {
t.switch_to(state_before_attribute_value);
}
Some('>') => {
t.switch_to(state_data);
t.emit_current_tag_token();
}
None => {
t.emit_error(ErrorCode::EofInTag);
t.emit_eof();
}
_ => {
t.start_new_attribute();
t.reconsume_in(state_attribute_name);
}
}
}
fn state_before_attribute_value(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {}
Some('"') => {
t.switch_to(state_attribute_value_double_quoted);
}
Some('\'') => {
t.switch_to(state_attribute_value_single_quoted);
}
Some('>') => {
t.emit_error(ErrorCode::MissingAttributeValue);
t.switch_to(state_data);
t.emit_current_tag_token();
}
_ => {
t.reconsume_in(state_attribute_value_unquoted);
}
}
}
fn state_attribute_value_double_quoted(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('"') => {
t.switch_to(state_after_attribute_value_quoted);
}
Some('&') => {
t.consumed_as_part_of_attribute = true;
t.set_return_state(state_attribute_value_double_quoted);
t.switch_to(state_character_reference);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_attribute()
.value
.push(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_error(ErrorCode::EofInTag);
t.emit_eof();
}
Some(c) => {
t.current_attribute().value.push(c);
}
}
}
fn state_attribute_value_single_quoted(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\'') => {
t.switch_to(state_after_attribute_value_quoted);
}
Some('&') => {
t.consumed_as_part_of_attribute = true;
t.set_return_state(state_attribute_value_single_quoted);
t.switch_to(state_character_reference);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_attribute()
.value
.push(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_error(ErrorCode::EofInTag);
t.emit_eof();
}
Some(c) => t.current_attribute().value.push(c),
}
}
fn state_attribute_value_unquoted(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {
t.switch_to(state_before_attribute_name)
}
Some('&') => {
t.consumed_as_part_of_attribute = true;
t.set_return_state(state_attribute_value_unquoted);
t.switch_to(state_character_reference);
}
Some('>') => {
t.switch_to(state_data);
t.emit_current_tag_token();
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_attribute()
.value
.push(char::REPLACEMENT_CHARACTER);
}
Some(c @ '"') | Some(c @ '\'') | Some(c @ '<') | Some(c @ '=') | Some(c @ '`') => {
t.emit_error(ErrorCode::UnexpectedCharacterInUnquotedAttributeValue);
t.current_attribute().value.push(c);
}
None => {
t.emit_error(ErrorCode::EofInTag);
t.emit_eof();
}
Some(c) => {
t.current_attribute().value.push(c);
}
}
}
fn state_after_attribute_value_quoted(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {
t.switch_to(state_before_attribute_name)
}
Some('/') => t.switch_to(state_self_closing_start_tag),
Some('>') => {
t.switch_to(state_data);
t.emit_current_tag_token();
}
None => {
t.emit_error(ErrorCode::EofInTag);
t.emit_eof();
}
_ => {
t.emit_error(ErrorCode::MissingWhitespaceBetweenAttributes);
t.reconsume_in(state_before_attribute_name);
}
}
}
fn state_self_closing_start_tag(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('>') => {
t.current_tag_token().self_closing = true;
t.switch_to(state_data);
t.emit_current_tag_token();
}
None => {
t.emit_error(ErrorCode::EofInTag);
t.emit_eof();
}
_ => {
t.emit_error(ErrorCode::UnexpectedSolidusInTag);
t.reconsume_in(state_before_attribute_name);
}
}
}
fn state_bogus_comment(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('>') => {
t.switch_to(state_data);
t.emit_current_comment_token();
}
None => {
t.emit_current_comment_token();
t.emit_eof();
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_comment_token().push(char::REPLACEMENT_CHARACTER);
}
Some(c) => {
t.current_comment_token().push(c);
}
}
}
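// Dispatches on the lookahead after `<!`: `--` opens a comment, `doctype`
// (ASCII case-insensitive) a DOCTYPE, and `[CDATA[` (also matched
// case-insensitively here) becomes a bogus comment whose data starts with
// "[CDATA[", since this tokenizer does not track foreign-content context.
// Anything else is an incorrectly-opened-comment.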
fn state_markup_declaration_open(t: &mut Tokenizer) {
    if t.consume_match("--") {
        t.create_new_comment_token();
        t.switch_to(state_comment_start);
    } else if t.consume_case_insensitive_match("doctype") {
        t.switch_to(state_doctype);
    } else if t.consume_case_insensitive_match("[cdata[") {
        t.create_new_comment_token();
        t.current_comment_token().push_str("[CDATA[");
        t.switch_to(state_bogus_comment);
    } else {
        t.emit_error(ErrorCode::IncorrectlyOpenedComment);
        t.create_new_comment_token();
        t.switch_to(state_bogus_comment);
    }
}
fn state_comment_start(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.switch_to(state_comment_start_dash);
}
Some('>') => {
t.emit_error(ErrorCode::AbruptClosingOfEmptyComment);
t.switch_to(state_data);
t.emit_current_comment_token();
}
_ => {
t.reconsume_in(state_comment);
}
}
}
fn state_comment_start_dash(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.switch_to(state_comment_end);
}
Some('>') => {
t.emit_error(ErrorCode::AbruptClosingOfEmptyComment);
t.switch_to(state_data);
t.emit_current_comment_token();
}
None => {
t.emit_error(ErrorCode::EofInComment);
t.emit_current_comment_token();
t.emit_eof();
}
_ => {
t.current_comment_token().push('-');
t.reconsume_in(state_comment)
}
}
}
fn state_comment(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(c @ '<') => {
t.current_comment_token().push(c);
t.switch_to(state_comment_less_than_sign);
}
Some('-') => {
t.switch_to(state_comment_end_dash);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_comment_token().push(char::REPLACEMENT_CHARACTER)
}
None => {
t.emit_error(ErrorCode::EofInComment);
t.emit_current_comment_token();
t.emit_eof();
}
Some(c) => {
t.current_comment_token().push(c);
}
}
}
fn state_comment_less_than_sign(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(c @ '!') => {
t.current_comment_token().push(c);
t.switch_to(state_comment_less_than_sign_bang);
}
Some(c @ '<') => {
t.current_comment_token().push(c);
}
_ => {
t.reconsume_in(state_comment);
}
}
}
fn state_comment_less_than_sign_bang(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.switch_to(state_comment_less_than_sign_bang_dash);
}
_ => {
t.reconsume_in(state_comment);
}
}
}
fn state_comment_less_than_sign_bang_dash(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => t.switch_to(state_comment_less_than_sign_bang_dash_dash),
_ => t.reconsume_in(state_comment_end_dash),
}
}
fn state_comment_less_than_sign_bang_dash_dash(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('>') | None => {
t.reconsume_in(state_comment_end);
}
_ => {
t.emit_error(ErrorCode::NestedComment);
t.reconsume_in(state_comment_end);
}
}
}
fn state_comment_end_dash(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.switch_to(state_comment_end);
}
None => {
t.emit_error(ErrorCode::EofInComment);
t.emit_current_comment_token();
t.emit_eof();
}
_ => {
t.current_comment_token().push('-');
t.reconsume_in(state_comment);
}
}
}
fn state_comment_end(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('>') => {
t.switch_to(state_data);
t.emit_current_comment_token();
}
Some('!') => {
t.switch_to(state_comment_end_bang);
}
Some('-') => t.current_comment_token().push('-'),
None => {
t.emit_error(ErrorCode::EofInComment);
t.emit_current_comment_token();
t.emit_eof();
}
_ => {
t.current_comment_token().push_str("--");
t.reconsume_in(state_comment);
}
}
}
fn state_comment_end_bang(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('-') => {
t.current_comment_token().push_str("--!");
t.switch_to(state_comment_end_dash);
}
Some('>') => {
t.emit_error(ErrorCode::IncorrectlyClosedComment);
t.switch_to(state_data);
t.emit_current_comment_token();
}
None => {
t.emit_error(ErrorCode::EofInComment);
t.emit_current_comment_token();
t.emit_eof();
}
_ => {
t.current_comment_token().push_str("--!");
t.reconsume_in(state_comment);
}
}
}
fn state_doctype(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {
t.switch_to(state_before_doctype_name);
}
Some('>') => {
t.reconsume_in(state_before_doctype_name);
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.create_new_doctype_token();
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
_ => {
t.emit_error(ErrorCode::MissingWhitespaceBeforeDoctypeName);
t.reconsume_in(state_before_doctype_name);
}
}
}
fn state_before_doctype_name(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {}
Some(a) if a.is_ascii_uppercase() => {
t.create_new_doctype_token();
t.current_doctype_token().name = Some(String::from(a.to_ascii_lowercase()));
t.switch_to(state_doctype_name);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.create_new_doctype_token();
t.current_doctype_token().name = Some(String::from(char::REPLACEMENT_CHARACTER));
t.switch_to(state_doctype_name);
}
Some('>') => {
t.emit_error(ErrorCode::MissingDoctypeName);
t.create_new_doctype_token();
t.current_doctype_token().force_quirks = true;
t.switch_to(state_data);
t.emit_current_doctype_token();
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.create_new_doctype_token();
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
Some(c) => {
t.create_new_doctype_token();
t.current_doctype_token().name = Some(String::from(c));
t.switch_to(state_doctype_name);
}
}
}
fn state_doctype_name(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {
t.switch_to(state_after_doctype_name);
}
Some('>') => {
t.switch_to(state_data);
t.emit_current_doctype_token();
}
Some(a) if a.is_ascii_uppercase() => {
t.current_doctype_token_name().push(a.to_ascii_lowercase())
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_doctype_token_name()
.push(char::REPLACEMENT_CHARACTER);
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
Some(c) => {
t.current_doctype_token_name().push(c);
}
}
}
fn state_after_doctype_name(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {}
Some('>') => {
t.switch_to(state_data);
t.emit_current_doctype_token();
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
Some('P') | Some('p') if t.consume_case_insensitive_match("ublic") => {
t.switch_to(state_after_doctype_public_keyword);
}
Some('S') | Some('s') if t.consume_case_insensitive_match("ystem") => {
t.switch_to(state_after_doctype_system_keyword);
}
_ => {
t.emit_error(ErrorCode::InvalidCharacterSequenceAfterDoctypeName);
t.current_doctype_token().force_quirks = true;
t.reconsume_in(state_bogus_doctype);
}
}
}
fn state_after_doctype_public_keyword(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {
t.switch_to(state_before_doctype_public_identifier)
}
Some('"') => {
t.emit_error(ErrorCode::MissingWhitespaceAfterDoctypePublicKeyword);
t.current_doctype_token().public_identifier = Some(String::new());
t.switch_to(state_doctype_public_identifier_double_quoted);
}
Some('\'') => {
t.emit_error(ErrorCode::MissingWhitespaceAfterDoctypePublicKeyword);
t.current_doctype_token().public_identifier = Some(String::new());
t.switch_to(state_doctype_public_identifier_single_quoted);
}
Some('>') => {
t.emit_error(ErrorCode::MissingDoctypePublicIdentifier);
t.current_doctype_token().force_quirks = true;
t.switch_to(state_data);
t.emit_current_doctype_token();
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
_ => {
t.emit_error(ErrorCode::MissingQuoteBeforeDoctypePublicIdentifier);
t.current_doctype_token().force_quirks = true;
t.reconsume_in(state_bogus_doctype);
}
}
}
fn state_before_doctype_public_identifier(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {}
Some('"') => {
t.current_doctype_token().public_identifier = Some(String::new());
t.switch_to(state_doctype_public_identifier_double_quoted);
}
Some('\'') => {
t.current_doctype_token().public_identifier = Some(String::new());
t.switch_to(state_doctype_public_identifier_single_quoted);
}
Some('>') => {
t.emit_error(ErrorCode::MissingDoctypePublicIdentifier);
t.current_doctype_token().force_quirks = true;
t.switch_to(state_data);
t.emit_current_doctype_token();
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
_ => {
t.emit_error(ErrorCode::MissingQuoteBeforeDoctypePublicIdentifier);
t.current_doctype_token().force_quirks = true;
t.reconsume_in(state_bogus_doctype);
}
}
}
fn state_doctype_public_identifier_double_quoted(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('"') => {
t.switch_to(state_after_doctype_public_identifier);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_doctype_token_public_identifier()
.push(char::REPLACEMENT_CHARACTER);
}
Some('>') => {
t.emit_error(ErrorCode::AbruptDoctypePublicIdentifier);
t.current_doctype_token().force_quirks = true;
t.switch_to(state_data);
t.emit_current_doctype_token();
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
Some(c) => {
t.current_doctype_token_public_identifier().push(c);
}
}
}
fn state_doctype_public_identifier_single_quoted(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\'') => {
t.switch_to(state_after_doctype_public_identifier);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_doctype_token_public_identifier()
.push(char::REPLACEMENT_CHARACTER);
}
Some('>') => {
t.emit_error(ErrorCode::AbruptDoctypePublicIdentifier);
t.current_doctype_token().force_quirks = true;
t.switch_to(state_data);
t.emit_current_doctype_token();
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
Some(c) => {
t.current_doctype_token_public_identifier().push(c);
}
}
}
fn state_after_doctype_public_identifier(t: &mut Tokenizer) {
    match t.input_stream.consume() {
        Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {
            t.switch_to(state_between_doctype_public_and_system_identifiers);
        }
        Some('>') => {
            t.switch_to(state_data);
            t.emit_current_doctype_token();
        }
        Some('"') => {
            t.emit_error(ErrorCode::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
            t.current_doctype_token().system_identifier = Some(String::new());
            t.switch_to(state_doctype_system_identifier_double_quoted);
        }
        Some('\'') => {
            t.emit_error(ErrorCode::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
            t.current_doctype_token().system_identifier = Some(String::new());
            t.switch_to(state_doctype_system_identifier_single_quoted);
        }
        None => {
            t.emit_error(ErrorCode::EofInDoctype);
            t.current_doctype_token().force_quirks = true;
            t.emit_current_doctype_token();
            t.emit_eof();
        }
        _ => {
            t.emit_error(ErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
            t.current_doctype_token().force_quirks = true;
            t.reconsume_in(state_bogus_doctype);
        }
    }
}
fn state_between_doctype_public_and_system_identifiers(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {}
Some('>') => {
t.switch_to(state_data);
t.emit_current_doctype_token();
}
Some('"') => {
t.current_doctype_token().system_identifier = Some(String::new());
t.switch_to(state_doctype_system_identifier_double_quoted);
}
Some('\'') => {
t.current_doctype_token().system_identifier = Some(String::new());
t.switch_to(state_doctype_system_identifier_single_quoted);
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
_ => {
t.emit_error(ErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
t.current_doctype_token().force_quirks = true;
t.reconsume_in(state_bogus_doctype);
}
}
}
fn state_after_doctype_system_keyword(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {
t.switch_to(state_before_doctype_system_identifier);
}
Some('"') => {
t.emit_error(ErrorCode::MissingWhitespaceAfterDoctypeSystemKeyword);
t.current_doctype_token().system_identifier = Some(String::new());
t.switch_to(state_doctype_system_identifier_double_quoted);
}
Some('\'') => {
t.emit_error(ErrorCode::MissingWhitespaceAfterDoctypeSystemKeyword);
t.current_doctype_token().system_identifier = Some(String::new());
t.switch_to(state_doctype_system_identifier_single_quoted);
}
Some('>') => {
t.emit_error(ErrorCode::MissingDoctypeSystemIdentifier);
t.current_doctype_token().force_quirks = true;
t.switch_to(state_data);
t.emit_current_doctype_token();
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
_ => {
t.emit_error(ErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
t.current_doctype_token().force_quirks = true;
t.reconsume_in(state_bogus_doctype);
}
}
}
fn state_before_doctype_system_identifier(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {}
Some('"') => {
t.current_doctype_token().system_identifier = Some(String::new());
t.switch_to(state_doctype_system_identifier_double_quoted);
}
Some('\'') => {
t.current_doctype_token().system_identifier = Some(String::new());
t.switch_to(state_doctype_system_identifier_single_quoted);
}
Some('>') => {
t.emit_error(ErrorCode::MissingDoctypeSystemIdentifier);
t.current_doctype_token().force_quirks = true;
t.switch_to(state_data);
t.emit_current_doctype_token();
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
_ => {
t.emit_error(ErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
t.current_doctype_token().force_quirks = true;
t.reconsume_in(state_bogus_doctype);
}
}
}
fn state_doctype_system_identifier_double_quoted(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('"') => {
t.switch_to(state_after_doctype_system_identifier);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_doctype_token_system_identifier()
.push(char::REPLACEMENT_CHARACTER);
}
Some('>') => {
t.emit_error(ErrorCode::AbruptDoctypeSystemIdentifier);
t.current_doctype_token().force_quirks = true;
t.switch_to(state_data);
t.emit_current_doctype_token();
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
Some(c) => {
t.current_doctype_token_system_identifier().push(c);
}
}
}
fn state_doctype_system_identifier_single_quoted(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\'') => {
t.switch_to(state_after_doctype_system_identifier);
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
t.current_doctype_token_system_identifier()
.push(char::REPLACEMENT_CHARACTER);
}
Some('>') => {
t.emit_error(ErrorCode::AbruptDoctypeSystemIdentifier);
t.current_doctype_token().force_quirks = true;
t.switch_to(state_data);
t.emit_current_doctype_token();
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
Some(c) => {
t.current_doctype_token_system_identifier().push(c);
}
}
}
fn state_after_doctype_system_identifier(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('\t') | Some('\n') | Some('\x0c') | Some(' ') => {}
Some('>') => {
t.switch_to(state_data);
t.emit_current_doctype_token();
}
None => {
t.emit_error(ErrorCode::EofInDoctype);
t.current_doctype_token().force_quirks = true;
t.emit_current_doctype_token();
t.emit_eof();
}
_ => {
t.emit_error(ErrorCode::UnexpectedCharacterAfterDoctypeSystemIdentifier);
t.reconsume_in(state_bogus_doctype);
}
}
}
fn state_bogus_doctype(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some('>') => {
t.switch_to(state_data);
t.emit_current_doctype_token();
}
Some('\x00') => {
t.emit_error(ErrorCode::UnexpectedNullCharacter);
}
None => {
t.emit_current_doctype_token();
t.emit_eof();
}
_ => {}
}
}
#[allow(dead_code)]
fn state_cdata_section(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(']') => {
t.switch_to(state_cdata_section_bracket);
}
None => {
t.emit_error(ErrorCode::EofInCdata);
t.emit_eof();
}
Some(c) => {
t.emit_char_token(c);
}
}
}
fn state_cdata_section_bracket(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(']') => {
t.switch_to(state_cdata_section_end);
}
_ => {
t.emit_char_token(']');
t.reconsume_in(state_cdata_section);
}
}
}
fn state_cdata_section_end(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(']') => {
t.emit_char_token(']');
}
Some('>') => {
t.switch_to(state_data);
}
_ => {
t.emit_char_token(']');
t.emit_char_token(']');
t.reconsume_in(state_cdata_section);
}
}
}
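// Character references are resolved through a return state: the state that saw
// `&` stashes itself with `set_return_state`, the reference states accumulate
// text in `temporary_buffer`, and once the reference is resolved or abandoned
// the buffer is flushed and control goes back to the saved state.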
fn state_character_reference(t: &mut Tokenizer) {
t.temporary_buffer.clear();
t.temporary_buffer.push('&');
match t.input_stream.consume() {
        Some(a) if a.is_ascii_alphanumeric() => {
t.reconsume_in(state_named_character_reference);
}
Some(c @ '#') => {
t.temporary_buffer.push(c);
t.switch_to(state_numeric_character_reference);
}
_ => {
t.flush_codepoints_consumed_as_character_reference();
t.reconsume_in_return_state();
}
}
}
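// Consumes characters while the temporary buffer is still a prefix of some named
// reference, remembering the longest complete match via `InputStream::mark` /
// `rewind`. A match is expanded to its replacement text (with a
// missing-semicolon-after-character-reference error when it did not end in `;`),
// except inside attribute values when the match lacks a `;` and the next
// character is `=` or alphanumeric (the spec's legacy compatibility rule), in
// which case the raw text is flushed unchanged.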
fn state_named_character_reference(t: &mut Tokenizer) {
let mut last_match = None;
while let Some(c) = t.input_stream.consume() {
t.temporary_buffer.push(c);
let num_prefix_matches = NAMED_CHARACTER_REFERENCES
.keys()
.filter(|&k| k.starts_with(t.temporary_buffer.as_str()))
.count();
if num_prefix_matches == 0 {
break;
}
if let Some(result) = NAMED_CHARACTER_REFERENCES.get(t.temporary_buffer.as_str()) {
last_match = Some((t.temporary_buffer.len(), c, result));
t.input_stream.mark();
}
}
if let Some((last_match_len, last_matched_char, result)) = last_match {
t.input_stream.rewind();
t.temporary_buffer.truncate(last_match_len);
if t.consumed_as_part_of_attribute
&& last_matched_char != ';'
&& (t.input_stream.peek() == Some('=')
                || t.input_stream.peek().is_some_and(|c| c.is_ascii_alphanumeric()))
{
t.flush_codepoints_consumed_as_character_reference();
t.switch_to_return_state();
} else {
if last_matched_char != ';' {
t.emit_error(ErrorCode::MissingSemiColonAfterCharacterReference);
}
t.temporary_buffer.clear();
t.temporary_buffer.push_str(result);
t.flush_codepoints_consumed_as_character_reference();
t.switch_to_return_state();
}
} else {
t.flush_codepoints_consumed_as_character_reference();
t.switch_to(state_ambiguous_ampersand);
}
}
fn state_ambiguous_ampersand(t: &mut Tokenizer) {
match t.input_stream.consume() {
        Some(a) if a.is_ascii_alphanumeric() => {
if t.consumed_as_part_of_attribute {
t.current_attribute().value.push(a);
} else {
t.emit_char_token(a);
}
}
Some(';') => {
t.emit_error(ErrorCode::UnknownNamedCharacterReference);
t.reconsume_in_return_state();
}
_ => {
t.reconsume_in_return_state();
}
}
}
fn state_numeric_character_reference(t: &mut Tokenizer) {
    t.character_reference_code = 0;
    match t.input_stream.consume() {
        Some(c @ 'x') | Some(c @ 'X') => {
            t.temporary_buffer.push(c);
            t.switch_to(state_hexadecimal_character_reference_start);
        }
        _ => {
            t.reconsume_in(state_decimal_character_reference_start);
        }
    }
}
}
fn state_hexadecimal_character_reference_start(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(a) if a.is_ascii_hexdigit() => {
t.reconsume_in(state_hexadecimal_character_reference);
}
_ => {
t.emit_error(ErrorCode::AbsenceOfDigitsInNumericCharacterReference);
t.flush_codepoints_consumed_as_character_reference();
t.reconsume_in_return_state();
}
}
}
fn state_decimal_character_reference_start(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(a) if a.is_ascii_digit() => {
t.reconsume_in(state_decimal_character_reference);
}
_ => {
t.emit_error(ErrorCode::AbsenceOfDigitsInNumericCharacterReference);
t.flush_codepoints_consumed_as_character_reference();
t.reconsume_in_return_state();
}
}
}
fn state_hexadecimal_character_reference(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(a) if a.is_ascii_hexdigit() => {
t.character_reference_code = t.character_reference_code.saturating_mul(16);
t.character_reference_code = t
.character_reference_code
.saturating_add(a.to_digit(16).unwrap());
}
Some(';') => {
t.switch_to(state_numeric_character_reference_end);
}
_ => {
t.emit_error(ErrorCode::MissingSemiColonAfterCharacterReference);
t.reconsume_in(state_numeric_character_reference_end);
}
}
}
fn state_decimal_character_reference(t: &mut Tokenizer) {
match t.input_stream.consume() {
Some(a) if a.is_ascii_digit() => {
t.character_reference_code = t.character_reference_code.saturating_mul(10);
t.character_reference_code = t
.character_reference_code
.saturating_add(a.to_digit(10).unwrap());
}
Some(';') => t.switch_to(state_numeric_character_reference_end),
_ => {
t.emit_error(ErrorCode::MissingSemiColonAfterCharacterReference);
t.reconsume_in(state_numeric_character_reference_end);
}
}
}
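// Sanitizes the accumulated code point as the spec requires: NUL, out-of-range
// and surrogate values are reported and replaced with U+FFFD, noncharacters and
// non-whitespace controls are reported, and the C1 control range is remapped to
// its Windows-1252 equivalents before the character is flushed.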
fn state_numeric_character_reference_end(t: &mut Tokenizer) {
    let mut code = t.character_reference_code;
    if code == 0 {
        t.emit_error(ErrorCode::NullCharacterReference);
        code = 0xfffd;
    }
    if code > 0x10FFFF {
        t.emit_error(ErrorCode::CharacterReferenceOutsideUnicodeRange);
        code = 0xfffd;
    }
    if (0xd800..=0xdbff).contains(&code) || (0xdc00..=0xdfff).contains(&code) {
        t.emit_error(ErrorCode::SurrogateCharacterRefference);
        code = 0xfffd;
    }
    if (0xfdd0..=0xfdef).contains(&code) || NON_CHARACTERS.contains(&code) {
        t.emit_error(ErrorCode::NonCharacterReference);
    }
    if code == 0x0d || (is_control(code) && !is_ascii_whitespace(code)) {
        t.emit_error(ErrorCode::ControlCharacterReference);
        code = match code {
            0x80 => 0x20AC,
            0x82 => 0x201A,
            0x83 => 0x0192,
            0x84 => 0x201E,
            0x85 => 0x2026,
            0x86 => 0x2020,
            0x87 => 0x2021,
            0x88 => 0x02C6,
            0x89 => 0x2030,
            0x8A => 0x0160,
            0x8B => 0x2039,
            0x8C => 0x0152,
            0x8E => 0x017D,
            0x91 => 0x2018,
            0x92 => 0x2019,
            0x93 => 0x201C,
            0x94 => 0x201D,
            0x95 => 0x2022,
            0x96 => 0x2013,
            0x97 => 0x2014,
            0x98 => 0x02DC,
            0x99 => 0x2122,
            0x9A => 0x0161,
            0x9B => 0x203A,
            0x9C => 0x0153,
            0x9E => 0x017E,
            0x9F => 0x0178,
            _ => code,
        };
    }
    t.temporary_buffer.clear();
    t.temporary_buffer.push(char::from_u32(code).unwrap());
    t.flush_codepoints_consumed_as_character_reference();
    t.switch_to_return_state();
}
impl Display for TokenizerError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let snippet = Snippet::source(self.source.as_str()).fold(true).annotation(
AnnotationKind::Primary
.span(self.offset..self.offset)
.label(self.code.as_str()),
);
let report = Level::ERROR
.primary_title("HTML tokenizer error:")
.element(snippet);
let renderer = annotate_snippets::Renderer::plain();
let display = renderer.render(&[report]);
write!(f, "{display}")
}
}
impl ErrorCode {
fn as_str(&self) -> &'static str {
match self {
ErrorCode::UnexpectedNullCharacter => "unexpected-null-character",
ErrorCode::UnexpectedQuestionMarkInsteadOfTagName => {
"unexpected-question-mark-instead-of-tag-name"
}
ErrorCode::UnexpectedEqualsSignBeforeAttributeName => {
"unexpected-equals-sign-before-attribute-name"
}
ErrorCode::UnexpectedCharacterInAttributeName => {
"unexpected-character-in-attribute-name"
}
            ErrorCode::UnexpectedCharacterInUnquotedAttributeValue => {
                "unexpected-character-in-unquoted-attribute-value"
            }
ErrorCode::UnexpectedSolidusInTag => "unexpected-solidus-in-tag",
ErrorCode::MissingSemiColonAfterCharacterReference => {
"missing-semicolon-after-character-reference"
}
ErrorCode::MissingEndTagName => "missing-end-tag-name",
ErrorCode::MissingAttributeValue => "missing-attribute-value",
ErrorCode::MissingWhitespaceBetweenAttributes => {
"missing-whitespace-between-attributes"
}
ErrorCode::UnknownNamedCharacterReference => "unknown-named-character-reference",
ErrorCode::AbsenceOfDigitsInNumericCharacterReference => {
"absence-of-digits-in-numeric-character-reference"
}
ErrorCode::NullCharacterReference => "null-character-reference",
ErrorCode::CharacterReferenceOutsideUnicodeRange => {
"character-reference-outside-unicode-range"
}
ErrorCode::SurrogateCharacterRefference => "surrogate-character-reference",
ErrorCode::NonCharacterReference => "noncharacter-character-reference",
ErrorCode::ControlCharacterReference => "control-character-reference",
ErrorCode::EofBeforeTagName => "eof-before-tag-name",
ErrorCode::EofInTag => "eof-in-tag",
ErrorCode::InvalidFirstCharacterOfTagName => "invalid-first-character-of-tag-name",
ErrorCode::IncorrectlyOpenedComment => "incorrectly-opened-comment",
ErrorCode::EofInComment => "eof-in-comment",
ErrorCode::NestedComment => "nested-comment",
ErrorCode::IncorrectlyClosedComment => "incorrectly-closed-comment",
ErrorCode::EofInDoctype => "eof-in-doctype",
ErrorCode::MissingWhitespaceBeforeDoctypeName => {
"missing-whitespace-before-doctype-name"
}
            ErrorCode::MissingDoctypeName => "missing-doctype-name",
ErrorCode::InvalidCharacterSequenceAfterDoctypeName => {
"invalid-character-sequence-after-doctype-name"
}
ErrorCode::MissingWhitespaceAfterDoctypePublicKeyword => {
"missing-whitespace-after-doctype-public-keyword"
}
ErrorCode::MissingDoctypePublicIdentifier => "missing-doctype-public-identifier",
ErrorCode::MissingQuoteBeforeDoctypePublicIdentifier => {
"missing-quote-before-doctype-public-identifier"
}
ErrorCode::AbruptDoctypePublicIdentifier => "abrupt-doctype-public-identifier",
ErrorCode::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers => {
"missing-whitespace-between-doctype-public-and-system-identifiers"
}
ErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier => {
"missing-quote-before-doctype-system-identifier"
}
ErrorCode::MissingWhitespaceAfterDoctypeSystemKeyword => {
"missing-whitespace-after-doctype-system-keyword"
}
ErrorCode::MissingDoctypeSystemIdentifier => "missing-doctype-system-identifier",
ErrorCode::AbruptDoctypeSystemIdentifier => "abrupt-doctype-system-identifier",
            ErrorCode::UnexpectedCharacterAfterDoctypeSystemIdentifier => {
                "unexpected-character-after-doctype-system-identifier"
            }
ErrorCode::EofInCdata => "eof-in-cdata",
ErrorCode::EofInScriptHtmlCommentLikeText => "eof-in-script-html-comment-like-text",
ErrorCode::DuplicateAttribute => "duplicate-attribute",
ErrorCode::AbruptClosingOfEmptyComment => "abrupt-closing-of-empty-comment",
ErrorCode::EndTagWithTrailingSolidus => "end-tag-with-trailing-solidus",
ErrorCode::EndTagWithAttributes => "end-tag-with-attributes",
}
}
}
fn is_c0_control(code: u32) -> bool {
(0x0000..=0x001F).contains(&code)
}
fn is_control(code: u32) -> bool {
is_c0_control(code) || (0x007f..=0x009f).contains(&code)
}
fn is_ascii_whitespace(code: u32) -> bool {
matches!(code, 0x0009 | 0x000A | 0x000C | 0x0020)
}
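// Minimal end-to-end usage sketch, kept separate from the main test suite in
// `tests.rs` and meant as an illustration of the `next_token` API: build a
// Tokenizer, pull tokens until Eof, and check the overall shape. It assumes the
// crate's InputStream yields the input's characters in order and None once the
// input is exhausted, which is how the states above use it.
#[cfg(test)]
mod usage_example {
    use super::*;

    #[test]
    fn start_tag_text_end_tag_then_eof() {
        let mut tokenizer = Tokenizer::new("<p>hi</p>");
        let mut kinds = Vec::new();
        loop {
            let token = tokenizer.next_token();
            let is_eof = matches!(token.kind, TokenKind::Eof);
            kinds.push(token.kind);
            if is_eof {
                break;
            }
        }
        // Expected shape: start tag, coalesced text, end tag, EOF; no parse errors.
        assert!(matches!(
            &kinds[0],
            TokenKind::Tag(Tag {
                kind: TagKind::Start,
                ..
            })
        ));
        assert!(matches!(&kinds[1], TokenKind::Text(text) if text == "hi"));
        assert!(matches!(
            &kinds[2],
            TokenKind::Tag(Tag {
                kind: TagKind::End,
                ..
            })
        ));
        assert!(matches!(&kinds[3], TokenKind::Eof));
        assert!(tokenizer.errors.is_empty());
    }
}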