use std::collections::HashMap;
use std::sync::LazyLock;
use crate::{
html::grammar::{chars, HtmlParseError, HTML_NAMESPACE},
xpath::grammar::XpathItemTreeNode,
};
use super::{
named_character_references::{NAMED_CHARACTER_REFS, NAMED_CHARACTER_REFS_MAX_LENGTH},
Attribute, CommentToken, DoctypeToken, HtmlToken, TagToken, TagTokenType, Tokenizer,
TokenizerError, TokenizerState,
};
impl<'a> Tokenizer<'a> {
pub(super) fn data_state(&mut self) -> Result<(), HtmlParseError> {
match self.input_stream.next() {
Some(c) => match c {
'&' => {
self.return_state = Some(TokenizerState::Data);
self.state = TokenizerState::CharacterReference;
}
'<' => {
self.state = TokenizerState::TagOpen;
}
&chars::NULL => {
let current_input_character = *c;
self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
self.emit(HtmlToken::Character(current_input_character))?;
}
_ => {
let first = *c;
let rest = self.input_stream.consume_while(|c| *c != '&' && *c != '<' && *c != '\0');
if rest.is_empty() {
self.emit(HtmlToken::Character(first))?;
} else {
let mut batch = String::with_capacity(1 + rest.len());
batch.push(first);
batch.extend(rest.iter());
self.emit(HtmlToken::Characters(batch))?;
}
}
},
None => self.emit(HtmlToken::EndOfFile)?,
};
Ok(())
}
pub(super) fn rcdata_state(&mut self) -> Result<(), HtmlParseError> {
match self.input_stream.next() {
Some(c) => match c {
'&' => {
self.return_state = Some(TokenizerState::RCDATA);
self.state = TokenizerState::CharacterReference;
}
'<' => {
self.state = TokenizerState::RCDATALessThanSign;
}
&chars::NULL => {
self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
self.emit(HtmlToken::Character(chars::REPLACEMENT_CHARACTER))?;
}
_ => {
let first = *c;
let rest = self.input_stream.consume_while(|c| *c != '&' && *c != '<' && *c != '\0');
if rest.is_empty() {
self.emit(HtmlToken::Character(first))?;
} else {
let mut batch = String::with_capacity(1 + rest.len());
batch.push(first);
batch.extend(rest.iter());
self.emit(HtmlToken::Characters(batch))?;
}
}
},
None => self.emit(HtmlToken::EndOfFile)?,
};
Ok(())
}
pub(super) fn rawtext_state(&mut self) -> Result<(), HtmlParseError> {
match self.input_stream.next() {
Some(c) => match c {
'<' => {
self.state = TokenizerState::RAWTEXTLessThanSign;
}
&chars::NULL => {
self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
self.emit(HtmlToken::Character(chars::REPLACEMENT_CHARACTER))?;
}
_ => {
let first = *c;
let rest = self.input_stream.consume_while(|c| *c != '<' && *c != '\0');
if rest.is_empty() {
self.emit(HtmlToken::Character(first))?;
} else {
let mut batch = String::with_capacity(1 + rest.len());
batch.push(first);
batch.extend(rest.iter());
self.emit(HtmlToken::Characters(batch))?;
}
}
},
None => self.emit(HtmlToken::EndOfFile)?,
};
Ok(())
}
/// Script data state: consumes one character, or a run of plain text, from the input.
///
/// * `<` switches to `ScriptDataLessThanSign`.
/// * NUL reports `UnexpectedNullCharacter` and emits `REPLACEMENT_CHARACTER`.
/// * Any other character is emitted together with whatever `consume_while`
///   collects using a predicate that rejects `<` and NUL.
/// * End of input emits `EndOfFile`.
///
/// # Errors
/// Propagates errors from `handle_error` and `emit`.
pub(super) fn script_data_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => self.emit(HtmlToken::EndOfFile)?,
        Some('<') => self.state = TokenizerState::ScriptDataLessThanSign,
        Some(chars::NULL) => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            self.emit(HtmlToken::Character(chars::REPLACEMENT_CHARACTER))?;
        }
        Some(head) => {
            let tail = self
                .input_stream
                .consume_while(|ch| !matches!(*ch, '<' | '\0'));
            let token = if tail.is_empty() {
                HtmlToken::Character(head)
            } else {
                let mut text = String::with_capacity(1 + tail.len());
                text.push(head);
                text.extend(tail.iter());
                HtmlToken::Characters(text)
            };
            self.emit(token)?;
        }
    }
    Ok(())
}
/// Tag open state: decides what follows a `<`.
///
/// * `!` switches to `MarkupDeclarationOpen`; `/` switches to `EndTagOpen`.
/// * `?` reports `UnexpectedQuestionMarkInsteadOfTagName`, starts an empty comment
///   token and reconsumes in `BogusComment`.
/// * An ASCII letter starts a start-tag token with an empty name and reconsumes in
///   `TagName`.
/// * Anything else reports `InvalidFirstCharacterOfTagName` (or `EofBeforeTagName`
///   at end of input), emits `<` and reconsumes in `Data`.
///
/// # Errors
/// Propagates errors from `handle_error`, `emit` and `reconsume_in_state`.
pub(super) fn tag_open_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        Some('!') => self.state = TokenizerState::MarkupDeclarationOpen,
        Some('/') => self.state = TokenizerState::EndTagOpen,
        Some('?') => {
            self.handle_error(TokenizerError::UnexpectedQuestionMarkInsteadOfTagName)?;
            self.comment_token = Some(CommentToken {
                data: String::new(),
            });
            self.reconsume_in_state(TokenizerState::BogusComment)?;
        }
        Some(letter) if letter.is_ascii_alphabetic() => {
            let start_tag = TagToken::new(String::new());
            self.tag_token = Some(TagTokenType::StartTag(start_tag));
            self.reconsume_in_state(TokenizerState::TagName)?;
        }
        other => {
            // Both fallbacks differ only in which error is reported.
            let error = if other.is_some() {
                TokenizerError::InvalidFirstCharacterOfTagName
            } else {
                TokenizerError::EofBeforeTagName
            };
            self.handle_error(error)?;
            self.emit(HtmlToken::Character('<'))?;
            self.reconsume_in_state(TokenizerState::Data)?;
        }
    }
    Ok(())
}
/// End tag open state: decides what follows `</`.
///
/// * An ASCII letter starts an end-tag token with an empty name and reconsumes in
///   `TagName`.
/// * `>` reports `MissingEndTagName` and switches to `Data`.
/// * End of input reports `EofBeforeTagName`, emits `<` and `/`, and reconsumes in
///   `Data`.
/// * Anything else reports `InvalidFirstCharacterOfTagName`, starts an empty
///   comment token and reconsumes in `BogusComment`.
///
/// # Errors
/// Propagates errors from `handle_error`, `emit` and `reconsume_in_state`.
pub(super) fn end_tag_open_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofBeforeTagName)?;
            for literal in ['<', '/'] {
                self.emit(HtmlToken::Character(literal))?;
            }
            self.reconsume_in_state(TokenizerState::Data)?;
        }
        Some(letter) if letter.is_ascii_alphabetic() => {
            let end_tag = TagToken::new(String::new());
            self.tag_token = Some(TagTokenType::EndTag(end_tag));
            self.reconsume_in_state(TokenizerState::TagName)?;
        }
        Some('>') => {
            self.handle_error(TokenizerError::MissingEndTagName)?;
            self.state = TokenizerState::Data;
        }
        Some(_) => {
            self.handle_error(TokenizerError::InvalidFirstCharacterOfTagName)?;
            self.comment_token = Some(CommentToken::new(String::new()));
            self.reconsume_in_state(TokenizerState::BogusComment)?;
        }
    }
    Ok(())
}
/// Tag name state: accumulates the name of the current tag token.
///
/// * Tab, line feed, form feed or space is recorded in `attribute_prefix_buffer`
///   and switches to `BeforeAttributeName`.
/// * `/` switches to `SelfClosingStartTag`.
/// * `>` switches to `Data` and emits the current tag token.
/// * NUL reports `UnexpectedNullCharacter` and appends `REPLACEMENT_CHARACTER`.
/// * An ASCII uppercase letter is appended in lowercase.
/// * Any other character is appended together with whatever `consume_while`
///   collects using a predicate that rejects all of the characters above.
/// * End of input reports `EofInTag` and emits `EndOfFile`.
///
/// # Errors
/// Propagates errors from the tokenizer helpers, and fails with
/// "no current tag found" when the bulk path runs without a tag token.
pub(super) fn tag_name_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInTag)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some(
            blank @ (chars::CHARACTER_TABULATION
            | chars::LINE_FEED
            | chars::FORM_FEED
            | chars::SPACE),
        ) => {
            self.attribute_prefix_buffer.push(blank);
            self.state = TokenizerState::BeforeAttributeName;
        }
        Some('/') => self.state = TokenizerState::SelfClosingStartTag,
        Some('>') => {
            self.state = TokenizerState::Data;
            self.emit_current_tag_token()?;
        }
        Some(chars::NULL) => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            let name = self.current_tag_token_mut()?.tag_name_mut();
            name.push(chars::REPLACEMENT_CHARACTER);
        }
        Some(upper) if upper.is_ascii_uppercase() => {
            let lowered = upper.to_ascii_lowercase();
            self.current_tag_token_mut()?.tag_name_mut().push(lowered);
        }
        Some(head) => {
            let tail = self.input_stream.consume_while(|ch| {
                !(ch.is_ascii_uppercase()
                    || matches!(*ch, '\t' | '\n' | '\x0C' | ' ' | '/' | '>' | '\0'))
            });
            // The tag token is reached through the field so the borrow stays
            // disjoint from the input stream that `tail` came from.
            let current_tag = self
                .tag_token
                .as_mut()
                .ok_or_else(|| HtmlParseError::new("no current tag found"))?;
            let name = current_tag.tag_name_mut();
            name.reserve(1 + tail.len());
            name.push(head);
            name.extend(tail.iter());
        }
    }
    Ok(())
}
/// RCDATA less-than sign state.
///
/// `/` clears `temporary_buffer` and switches to `RCDATAEndTagOpen`; anything else
/// (including end of input) emits `<` and reconsumes in `RCDATA`.
///
/// # Errors
/// Propagates errors from `emit` and `reconsume_in_state`.
pub(super) fn rcdata_less_than_sign_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    if next == Some('/') {
        self.temporary_buffer.clear();
        self.state = TokenizerState::RCDATAEndTagOpen;
    } else {
        self.emit(HtmlToken::Character('<'))?;
        self.reconsume_in_state(TokenizerState::RCDATA)?;
    }
    Ok(())
}
/// RCDATA end tag open state.
///
/// An ASCII letter starts an end-tag token with an empty name and reconsumes in
/// `RCDATAEndTagName`; anything else (including end of input) emits `<` and `/`
/// and reconsumes in `RCDATA`.
///
/// # Errors
/// Propagates errors from `emit` and `reconsume_in_state`.
pub(super) fn rcdata_end_tag_open_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    if matches!(next, Some(letter) if letter.is_ascii_alphabetic()) {
        let end_tag = TagToken::new(String::new());
        self.tag_token = Some(TagTokenType::EndTag(end_tag));
        self.reconsume_in_state(TokenizerState::RCDATAEndTagName)?;
    } else {
        self.emit(HtmlToken::Character('<'))?;
        self.emit(HtmlToken::Character('/'))?;
        self.reconsume_in_state(TokenizerState::RCDATA)?;
    }
    Ok(())
}
/// RCDATA end tag name state.
///
/// * Tab, line feed, form feed or space, `/`, or `>` — only when
///   `is_current_end_tag_token_appropriate()` holds — switch to
///   `BeforeAttributeName`, `SelfClosingStartTag`, or `Data` (emitting the current
///   tag token) respectively.
/// * An ASCII letter is appended in lowercase to the tag name and, as read, to
///   `temporary_buffer`.
/// * Everything else (including the delimiters above when the end tag is not
///   appropriate) emits `<`, `/` and each buffered character, then reconsumes in
///   `RCDATA`.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn rcdata_end_tag_name_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        Some(
            chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE,
        ) if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::BeforeAttributeName;
        }
        Some('/') if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::SelfClosingStartTag;
        }
        Some('>') if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::Data;
            self.emit_current_tag_token()?;
        }
        Some(letter) if letter.is_ascii_alphabetic() => {
            // Lowercasing a lowercase letter is a no-op, so one arm covers both cases.
            let lowered = letter.to_ascii_lowercase();
            self.current_tag_token_mut()?.tag_name_mut().push(lowered);
            self.temporary_buffer.push(letter);
        }
        _ => {
            self.emit(HtmlToken::Character('<'))?;
            self.emit(HtmlToken::Character('/'))?;
            // Drain into an owned Vec first: `emit` needs `&mut self`.
            let buffered: Vec<char> = self.temporary_buffer.drain(..).collect();
            for ch in buffered {
                self.emit(HtmlToken::Character(ch))?;
            }
            self.reconsume_in_state(TokenizerState::RCDATA)?;
        }
    }
    Ok(())
}
/// Script data less-than sign state.
///
/// * `/` clears `temporary_buffer` and switches to `ScriptDataEndTagOpen`.
/// * `!` switches to `ScriptDataEscapeStart` and emits `<` followed by `!`.
/// * Anything else (including end of input) emits `<` and reconsumes in
///   `ScriptData`.
///
/// # Errors
/// Propagates errors from `emit` and `reconsume_in_state`.
pub(super) fn script_data_less_than_sign_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        Some('/') => {
            self.temporary_buffer.clear();
            self.state = TokenizerState::ScriptDataEndTagOpen;
        }
        Some('!') => {
            self.state = TokenizerState::ScriptDataEscapeStart;
            for literal in ['<', '!'] {
                self.emit(HtmlToken::Character(literal))?;
            }
        }
        _ => {
            self.emit(HtmlToken::Character('<'))?;
            self.reconsume_in_state(TokenizerState::ScriptData)?;
        }
    }
    Ok(())
}
/// Script data end tag open state.
///
/// An ASCII letter starts an end-tag token with an empty name and reconsumes in
/// `ScriptDataEndTagName`; anything else (including end of input) emits `<` and
/// `/` and reconsumes in `ScriptData`.
///
/// # Errors
/// Propagates errors from `emit` and `reconsume_in_state`.
pub(super) fn script_data_end_tag_open_state(&mut self) -> Result<(), HtmlParseError> {
    let starts_tag_name = self
        .input_stream
        .next()
        .is_some_and(|ch| ch.is_ascii_alphabetic());
    if starts_tag_name {
        let end_tag = TagToken::new(String::new());
        self.tag_token = Some(TagTokenType::EndTag(end_tag));
        self.reconsume_in_state(TokenizerState::ScriptDataEndTagName)?;
    } else {
        self.emit(HtmlToken::Character('<'))?;
        self.emit(HtmlToken::Character('/'))?;
        self.reconsume_in_state(TokenizerState::ScriptData)?;
    }
    Ok(())
}
/// Script data end tag name state.
///
/// * Tab, line feed, form feed or space, `/`, or `>` — only when
///   `is_current_end_tag_token_appropriate()` holds — switch to
///   `BeforeAttributeName`, `SelfClosingStartTag`, or `Data` (emitting the current
///   tag token) respectively.
/// * An ASCII letter is appended in lowercase to the tag name and, as read, to
///   `temporary_buffer`.
/// * Everything else emits `<`, `/` and each buffered character, then reconsumes
///   in `ScriptData`.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn script_data_end_tag_name_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        Some(
            chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE,
        ) if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::BeforeAttributeName;
        }
        Some('/') if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::SelfClosingStartTag;
        }
        Some('>') if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::Data;
            self.emit_current_tag_token()?;
        }
        Some(letter) if letter.is_ascii_alphabetic() => {
            // Lowercasing a lowercase letter is a no-op, so one arm covers both cases.
            let lowered = letter.to_ascii_lowercase();
            self.current_tag_token_mut()?.tag_name_mut().push(lowered);
            self.temporary_buffer.push(letter);
        }
        _ => {
            self.emit(HtmlToken::Character('<'))?;
            self.emit(HtmlToken::Character('/'))?;
            // Drain into an owned Vec first: `emit` needs `&mut self`.
            let buffered: Vec<char> = self.temporary_buffer.drain(..).collect();
            for ch in buffered {
                self.emit(HtmlToken::Character(ch))?;
            }
            self.reconsume_in_state(TokenizerState::ScriptData)?;
        }
    }
    Ok(())
}
/// Script data escape start state.
///
/// `-` switches to `ScriptDataEscapeStartDash` and emits `-`; anything else
/// (including end of input) reconsumes in `ScriptData`.
///
/// # Errors
/// Propagates errors from `emit` and `reconsume_in_state`.
pub(super) fn script_data_escape_start_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    if next == Some('-') {
        self.state = TokenizerState::ScriptDataEscapeStartDash;
        self.emit(HtmlToken::Character('-'))?;
    } else {
        self.reconsume_in_state(TokenizerState::ScriptData)?;
    }
    Ok(())
}
/// Script data escape start dash state.
///
/// `-` switches to `ScriptDataEscapedDashDash` and emits `-`; anything else
/// (including end of input) reconsumes in `ScriptData`.
///
/// # Errors
/// Propagates errors from `emit` and `reconsume_in_state`.
pub(super) fn script_data_escape_start_dash_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    if next == Some('-') {
        self.state = TokenizerState::ScriptDataEscapedDashDash;
        self.emit(HtmlToken::Character('-'))?;
    } else {
        self.reconsume_in_state(TokenizerState::ScriptData)?;
    }
    Ok(())
}
/// Script data escaped state.
///
/// * `-` switches to `ScriptDataEscapedDash` and emits `-`.
/// * `<` switches to `ScriptDataEscapedLessThanSign`.
/// * NUL reports `UnexpectedNullCharacter` and emits `REPLACEMENT_CHARACTER`.
/// * End of input reports `EofInScriptHtmlCommentLikeText` and emits `EndOfFile`.
/// * Any other character is emitted as-is.
///
/// # Errors
/// Propagates errors from `handle_error` and `emit`.
pub(super) fn script_data_escaped_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInScriptHtmlCommentLikeText)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some('<') => self.state = TokenizerState::ScriptDataEscapedLessThanSign,
        Some('-') => {
            self.state = TokenizerState::ScriptDataEscapedDash;
            self.emit(HtmlToken::Character('-'))?;
        }
        Some(chars::NULL) => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            self.emit(HtmlToken::Character(chars::REPLACEMENT_CHARACTER))?;
        }
        Some(other) => self.emit(HtmlToken::Character(other))?,
    }
    Ok(())
}
/// Script data escaped dash state.
///
/// * `-` switches to `ScriptDataEscapedDashDash` and emits `-`.
/// * `<` switches to `ScriptDataEscapedLessThanSign`.
/// * NUL reports `UnexpectedNullCharacter`, switches to `ScriptDataEscaped` and
///   emits `REPLACEMENT_CHARACTER`.
/// * End of input reports `EofInScriptHtmlCommentLikeText` and emits `EndOfFile`.
/// * Any other character switches to `ScriptDataEscaped` and is emitted.
///
/// # Errors
/// Propagates errors from `handle_error` and `emit`.
pub(super) fn script_data_escaped_dash_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInScriptHtmlCommentLikeText)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some('<') => self.state = TokenizerState::ScriptDataEscapedLessThanSign,
        Some('-') => {
            self.state = TokenizerState::ScriptDataEscapedDashDash;
            self.emit(HtmlToken::Character('-'))?;
        }
        Some(other) => {
            // NUL is reported before the state changes and is replaced on output.
            let emitted = if other == chars::NULL {
                self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
                chars::REPLACEMENT_CHARACTER
            } else {
                other
            };
            self.state = TokenizerState::ScriptDataEscaped;
            self.emit(HtmlToken::Character(emitted))?;
        }
    }
    Ok(())
}
/// Script data escaped dash dash state.
///
/// * `-` emits `-` and stays in this state.
/// * `<` switches to `ScriptDataEscapedLessThanSign`.
/// * `>` switches to `ScriptData` and emits `>`.
/// * NUL reports `UnexpectedNullCharacter`, switches to `ScriptDataEscaped` and
///   emits `REPLACEMENT_CHARACTER`.
/// * End of input reports `EofInScriptHtmlCommentLikeText` and emits `EndOfFile`.
/// * Any other character switches to `ScriptDataEscaped` and is emitted.
///
/// # Errors
/// Propagates errors from `handle_error` and `emit`.
pub(super) fn script_data_escaped_dash_dash_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInScriptHtmlCommentLikeText)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some('-') => self.emit(HtmlToken::Character('-'))?,
        Some('<') => self.state = TokenizerState::ScriptDataEscapedLessThanSign,
        Some('>') => {
            self.state = TokenizerState::ScriptData;
            self.emit(HtmlToken::Character('>'))?;
        }
        Some(other) => {
            // NUL is reported before the state changes and is replaced on output.
            let emitted = if other == chars::NULL {
                self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
                chars::REPLACEMENT_CHARACTER
            } else {
                other
            };
            self.state = TokenizerState::ScriptDataEscaped;
            self.emit(HtmlToken::Character(emitted))?;
        }
    }
    Ok(())
}
/// Script data escaped less-than sign state.
///
/// * `/` clears `temporary_buffer` and switches to `ScriptDataEscapedEndTagOpen`.
/// * An ASCII letter clears `temporary_buffer`, emits `<` and reconsumes in
///   `ScriptDataDoubleEscapeStart`.
/// * Anything else (including end of input) emits `<` and reconsumes in
///   `ScriptDataEscaped`.
///
/// # Errors
/// Propagates errors from `emit` and `reconsume_in_state`.
pub(super) fn script_data_escaped_less_than_sign_state(
    &mut self,
) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    if next == Some('/') {
        self.temporary_buffer.clear();
        self.state = TokenizerState::ScriptDataEscapedEndTagOpen;
        return Ok(());
    }

    let is_letter = matches!(next, Some(ch) if ch.is_ascii_alphabetic());
    if is_letter {
        self.temporary_buffer.clear();
    }
    self.emit(HtmlToken::Character('<'))?;
    let target = if is_letter {
        TokenizerState::ScriptDataDoubleEscapeStart
    } else {
        TokenizerState::ScriptDataEscaped
    };
    self.reconsume_in_state(target)?;
    Ok(())
}
/// Script data escaped end tag open state.
///
/// An ASCII letter starts an end-tag token with an empty name and reconsumes in
/// `ScriptDataEscapedEndTagName`; anything else (including end of input) emits `<`
/// and `/` and reconsumes in `ScriptDataEscaped`.
///
/// # Errors
/// Propagates errors from `emit` and `reconsume_in_state`.
pub(super) fn script_data_escaped_end_tag_open_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    if matches!(next, Some(letter) if letter.is_ascii_alphabetic()) {
        let end_tag = TagToken::new(String::new());
        self.tag_token = Some(TagTokenType::EndTag(end_tag));
        self.reconsume_in_state(TokenizerState::ScriptDataEscapedEndTagName)?;
    } else {
        self.emit(HtmlToken::Character('<'))?;
        self.emit(HtmlToken::Character('/'))?;
        self.reconsume_in_state(TokenizerState::ScriptDataEscaped)?;
    }
    Ok(())
}
/// Script data escaped end tag name state.
///
/// * Tab, line feed, form feed or space, `/`, or `>` — only when
///   `is_current_end_tag_token_appropriate()` holds — switch to
///   `BeforeAttributeName`, `SelfClosingStartTag`, or `Data` (emitting the current
///   tag token) respectively.
/// * An ASCII letter is appended in lowercase to the tag name and, as read, to
///   `temporary_buffer`.
/// * Everything else emits `<`, `/` and each buffered character, then reconsumes
///   in `ScriptDataEscaped`.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn script_data_escaped_end_tag_name_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        Some(
            chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE,
        ) if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::BeforeAttributeName;
        }
        Some('/') if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::SelfClosingStartTag;
        }
        Some('>') if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::Data;
            self.emit_current_tag_token()?;
        }
        Some(letter) if letter.is_ascii_alphabetic() => {
            // Lowercasing a lowercase letter is a no-op, so one arm covers both cases.
            let lowered = letter.to_ascii_lowercase();
            self.current_tag_token_mut()?.tag_name_mut().push(lowered);
            self.temporary_buffer.push(letter);
        }
        _ => {
            self.emit(HtmlToken::Character('<'))?;
            self.emit(HtmlToken::Character('/'))?;
            // Drain into an owned Vec first: `emit` needs `&mut self`.
            let buffered: Vec<char> = self.temporary_buffer.drain(..).collect();
            for ch in buffered {
                self.emit(HtmlToken::Character(ch))?;
            }
            self.reconsume_in_state(TokenizerState::ScriptDataEscaped)?;
        }
    }
    Ok(())
}
/// Script data double escape start state.
///
/// * Tab, line feed, form feed, space, `/` or `>`: switches to
///   `ScriptDataDoubleEscaped` when `temporary_buffer` spells `script`, otherwise
///   to `ScriptDataEscaped`; the character is emitted either way.
/// * An ASCII letter is pushed in lowercase onto `temporary_buffer` and emitted as
///   read.
/// * Anything else (including end of input) reconsumes in `ScriptDataEscaped`.
///
/// # Errors
/// Propagates errors from `emit` and `reconsume_in_state`.
pub(super) fn script_data_double_escape_start_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        Some(
            delimiter @ (chars::CHARACTER_TABULATION
            | chars::LINE_FEED
            | chars::FORM_FEED
            | chars::SPACE
            | '/'
            | '>'),
        ) => {
            let is_script = self.temporary_buffer.iter().copied().eq("script".chars());
            self.state = if is_script {
                TokenizerState::ScriptDataDoubleEscaped
            } else {
                TokenizerState::ScriptDataEscaped
            };
            self.emit(HtmlToken::Character(delimiter))?;
        }
        Some(letter) if letter.is_ascii_alphabetic() => {
            // Lowercasing a lowercase letter is a no-op, so one arm covers both cases.
            self.temporary_buffer.push(letter.to_ascii_lowercase());
            self.emit(HtmlToken::Character(letter))?;
        }
        _ => self.reconsume_in_state(TokenizerState::ScriptDataEscaped)?,
    }
    Ok(())
}
/// Script data double escaped state.
///
/// * `-` switches to `ScriptDataDoubleEscapedDash` and emits `-`.
/// * `<` switches to `ScriptDataDoubleEscapedLessThanSign` and emits `<`.
/// * NUL reports `UnexpectedNullCharacter` and emits `REPLACEMENT_CHARACTER`.
/// * End of input reports `EofInScriptHtmlCommentLikeText` and emits `EndOfFile`.
/// * Any other character is emitted as-is.
///
/// # Errors
/// Propagates errors from `handle_error` and `emit`.
pub(super) fn script_data_double_escaped_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInScriptHtmlCommentLikeText)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some(marker @ ('-' | '<')) => {
            self.state = if marker == '-' {
                TokenizerState::ScriptDataDoubleEscapedDash
            } else {
                TokenizerState::ScriptDataDoubleEscapedLessThanSign
            };
            self.emit(HtmlToken::Character(marker))?;
        }
        Some(chars::NULL) => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            self.emit(HtmlToken::Character(chars::REPLACEMENT_CHARACTER))?;
        }
        Some(other) => self.emit(HtmlToken::Character(other))?,
    }
    Ok(())
}
/// Script data double escaped dash state.
///
/// * `-` switches to `ScriptDataDoubleEscapedDashDash` and emits `-`.
/// * `<` switches to `ScriptDataDoubleEscapedLessThanSign` and emits `<`.
/// * NUL reports `UnexpectedNullCharacter`, switches to `ScriptDataDoubleEscaped`
///   and emits `REPLACEMENT_CHARACTER`.
/// * End of input reports `EofInScriptHtmlCommentLikeText` and emits `EndOfFile`.
/// * Any other character switches to `ScriptDataDoubleEscaped` and is emitted.
///
/// # Errors
/// Propagates errors from `handle_error` and `emit`.
pub(super) fn script_data_double_escaped_dash_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInScriptHtmlCommentLikeText)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some('-') => {
            self.state = TokenizerState::ScriptDataDoubleEscapedDashDash;
            self.emit(HtmlToken::Character('-'))?;
        }
        Some('<') => {
            self.state = TokenizerState::ScriptDataDoubleEscapedLessThanSign;
            self.emit(HtmlToken::Character('<'))?;
        }
        Some(other) => {
            // NUL is reported before the state changes and is replaced on output.
            let emitted = if other == chars::NULL {
                self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
                chars::REPLACEMENT_CHARACTER
            } else {
                other
            };
            self.state = TokenizerState::ScriptDataDoubleEscaped;
            self.emit(HtmlToken::Character(emitted))?;
        }
    }
    Ok(())
}
/// Script data double escaped dash dash state.
///
/// * `-` emits `-` and stays in this state.
/// * `<` switches to `ScriptDataDoubleEscapedLessThanSign` and emits `<`.
/// * `>` switches to `ScriptData` and emits `>`.
/// * NUL reports `UnexpectedNullCharacter`, switches to `ScriptDataDoubleEscaped`
///   and emits `REPLACEMENT_CHARACTER`.
/// * End of input reports `EofInScriptHtmlCommentLikeText` and emits `EndOfFile`.
/// * Any other character switches to `ScriptDataDoubleEscaped` and is emitted.
///
/// # Errors
/// Propagates errors from `handle_error` and `emit`.
pub(super) fn script_data_double_escaped_dash_dash_state(
    &mut self,
) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInScriptHtmlCommentLikeText)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some('-') => self.emit(HtmlToken::Character('-'))?,
        Some('<') => {
            self.state = TokenizerState::ScriptDataDoubleEscapedLessThanSign;
            self.emit(HtmlToken::Character('<'))?;
        }
        Some('>') => {
            self.state = TokenizerState::ScriptData;
            self.emit(HtmlToken::Character('>'))?;
        }
        Some(other) => {
            // NUL is reported before the state changes and is replaced on output.
            let emitted = if other == chars::NULL {
                self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
                chars::REPLACEMENT_CHARACTER
            } else {
                other
            };
            self.state = TokenizerState::ScriptDataDoubleEscaped;
            self.emit(HtmlToken::Character(emitted))?;
        }
    }
    Ok(())
}
/// Script data double escaped less-than sign state.
///
/// `/` clears `temporary_buffer`, switches to `ScriptDataDoubleEscapeEnd` and
/// emits `/`; anything else (including end of input) reconsumes in
/// `ScriptDataDoubleEscaped`.
///
/// # Errors
/// Propagates errors from `emit` and `reconsume_in_state`.
pub(super) fn script_data_double_escaped_less_than_sign_state(
    &mut self,
) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    if next == Some('/') {
        self.temporary_buffer.clear();
        self.state = TokenizerState::ScriptDataDoubleEscapeEnd;
        self.emit(HtmlToken::Character('/'))?;
    } else {
        self.reconsume_in_state(TokenizerState::ScriptDataDoubleEscaped)?;
    }
    Ok(())
}
/// Script data double escape end state.
///
/// * Tab, line feed, form feed, space, `/` or `>`: switches to `ScriptDataEscaped`
///   when `temporary_buffer` spells `script`, otherwise to
///   `ScriptDataDoubleEscaped`; the character is emitted either way.
/// * An ASCII letter is pushed in lowercase onto `temporary_buffer` and emitted as
///   read.
/// * Anything else (including end of input) reconsumes in
///   `ScriptDataDoubleEscaped`.
///
/// # Errors
/// Propagates errors from `emit` and `reconsume_in_state`.
pub(super) fn script_data_double_escape_end_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        Some(
            delimiter @ (chars::CHARACTER_TABULATION
            | chars::LINE_FEED
            | chars::FORM_FEED
            | chars::SPACE
            | '/'
            | '>'),
        ) => {
            let is_script = self.temporary_buffer.iter().copied().eq("script".chars());
            self.state = if is_script {
                TokenizerState::ScriptDataEscaped
            } else {
                TokenizerState::ScriptDataDoubleEscaped
            };
            self.emit(HtmlToken::Character(delimiter))?;
        }
        Some(letter) if letter.is_ascii_alphabetic() => {
            // Lowercasing a lowercase letter is a no-op, so one arm covers both cases.
            self.temporary_buffer.push(letter.to_ascii_lowercase());
            self.emit(HtmlToken::Character(letter))?;
        }
        _ => self.reconsume_in_state(TokenizerState::ScriptDataDoubleEscaped)?,
    }
    Ok(())
}
/// Before attribute name state.
///
/// * Tab, line feed, form feed or space is recorded in `attribute_prefix_buffer`.
/// * `/`, `>` or end of input reconsumes in `AfterAttributeName`.
/// * `=` reports `UnexpectedEqualsSignBeforeAttributeName`, creates an attribute
///   named `=` with an empty value and switches to `AttributeName`.
/// * Anything else creates an empty attribute and reconsumes in `AttributeName`.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn before_attribute_name_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        Some(
            blank @ (chars::CHARACTER_TABULATION
            | chars::LINE_FEED
            | chars::FORM_FEED
            | chars::SPACE),
        ) => self.attribute_prefix_buffer.push(blank),
        Some('/' | '>') | None => {
            self.reconsume_in_state(TokenizerState::AfterAttributeName)?;
        }
        Some('=') => {
            self.handle_error(TokenizerError::UnexpectedEqualsSignBeforeAttributeName)?;
            let equals_named = Attribute::new(String::from('='), String::new());
            self.create_new_attribute(equals_named)?;
            self.state = TokenizerState::AttributeName;
        }
        Some(_) => {
            let blank_attribute = Attribute::new(String::new(), String::new());
            self.create_new_attribute(blank_attribute)?;
            self.reconsume_in_state(TokenizerState::AttributeName)?;
        }
    }
    Ok(())
}
pub(super) fn attribute_name_state(&mut self) -> Result<(), HtmlParseError> {
match self.input_stream.next() {
Some(
&chars::CHARACTER_TABULATION
| &chars::LINE_FEED
| &chars::FORM_FEED
| &chars::SPACE
| '/'
| '>',
) => {
self.reconsume_in_state(TokenizerState::AfterAttributeName)?;
}
None => {
self.reconsume_in_state(TokenizerState::AfterAttributeName)?;
}
Some('=') => {
self.state = TokenizerState::BeforeAttributeValue;
}
Some(c) if c.is_ascii_uppercase() => {
let original = *c;
let c = c.to_ascii_lowercase();
self.push_char_to_attribute_name_with_original(c, original)?;
}
Some(&chars::NULL) => {
self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
self.push_char_to_attribute_name(chars::REPLACEMENT_CHARACTER)?;
}
Some(c) if ['"', '\'', '<'].contains(c) => {
let c = *c;
self.handle_error(TokenizerError::UnexpectedCharacterInAttributeName)?;
self.push_char_to_attribute_name(c)?;
}
Some(c) => {
let first = *c;
let rest = self.input_stream.consume_while(|c| {
!c.is_ascii_uppercase()
&& *c != '\t'
&& *c != '\n'
&& *c != '\x0C'
&& *c != ' '
&& *c != '/'
&& *c != '>'
&& *c != '='
&& *c != '\0'
&& *c != '"'
&& *c != '\''
&& *c != '<'
});
let attr = self
.tag_token
.as_mut()
.ok_or_else(|| HtmlParseError::new("no current tag found"))?
.attributes_mut()
.last_mut()
.ok_or_else(|| HtmlParseError::new("no attributes on current tag"))?;
attr.name.reserve(1 + rest.len());
attr.name.push(first);
attr.name.extend(rest.iter());
if let Some(ref mut orig) = attr.original_name {
orig.reserve(1 + rest.len());
orig.push(first);
orig.extend(rest.iter());
}
}
}
Ok(())
}
/// After attribute name state.
///
/// * Tab, line feed, form feed or space is ignored.
/// * `/` switches to `SelfClosingStartTag`; `=` switches to `BeforeAttributeValue`.
/// * `>` switches to `Data` and emits the current tag token.
/// * End of input reports `EofInTag` and emits `EndOfFile`.
/// * Anything else creates an empty attribute and reconsumes in `AttributeName`.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn after_attribute_name_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInTag)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        // Whitespace is skipped without any state change.
        Some(
            chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE,
        ) => {}
        Some('/') => self.state = TokenizerState::SelfClosingStartTag,
        Some('=') => self.state = TokenizerState::BeforeAttributeValue,
        Some('>') => {
            self.state = TokenizerState::Data;
            self.emit_current_tag_token()?;
        }
        Some(_) => {
            let blank_attribute = Attribute::new(String::new(), String::new());
            self.create_new_attribute(blank_attribute)?;
            self.reconsume_in_state(TokenizerState::AttributeName)?;
        }
    }
    Ok(())
}
/// Before attribute value state.
///
/// * Tab, line feed, form feed or space is ignored.
/// * `"` switches to `AttributeValueDoubleQuoted`; `'` switches to
///   `AttributeValueSingleQuoted`.
/// * `>` reports `MissingAttributeValue`, switches to `Data` and emits the current
///   tag token.
/// * Anything else (including end of input) reconsumes in
///   `AttributeValueUnquoted`.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn before_attribute_value_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        // Whitespace is skipped without any state change.
        Some(
            chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE,
        ) => {}
        Some('"') => self.state = TokenizerState::AttributeValueDoubleQuoted,
        Some('\'') => self.state = TokenizerState::AttributeValueSingleQuoted,
        Some('>') => {
            self.handle_error(TokenizerError::MissingAttributeValue)?;
            self.state = TokenizerState::Data;
            self.emit_current_tag_token()?;
        }
        _ => self.reconsume_in_state(TokenizerState::AttributeValueUnquoted)?,
    }
    Ok(())
}
/// Attribute value (double-quoted) state.
///
/// * `"` switches to `AfterAttributeValueQuoted`.
/// * `&` stores `AttributeValueDoubleQuoted` as the return state and switches to
///   `CharacterReference`.
/// * NUL reports `UnexpectedNullCharacter` and appends `REPLACEMENT_CHARACTER` to
///   the value.
/// * End of input reports `EofInTag` and emits `EndOfFile`.
/// * Any other character is appended to the newest attribute's value together with
///   whatever `consume_while` collects using a predicate that rejects `"`, `&`
///   and NUL.
///
/// # Errors
/// Propagates errors from the tokenizer helpers; the bulk path fails with
/// "no current tag found" or "no attributes on current tag" when there is no
/// attribute to extend.
pub(super) fn attribute_value_double_quoted_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInTag)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some('"') => self.state = TokenizerState::AfterAttributeValueQuoted,
        Some('&') => {
            self.return_state = Some(TokenizerState::AttributeValueDoubleQuoted);
            self.state = TokenizerState::CharacterReference;
        }
        Some(chars::NULL) => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            self.push_char_to_attribute_value(chars::REPLACEMENT_CHARACTER)?;
        }
        Some(head) => {
            let tail = self
                .input_stream
                .consume_while(|ch| !matches!(*ch, '"' | '&' | '\0'));
            let current_tag = self
                .tag_token
                .as_mut()
                .ok_or_else(|| HtmlParseError::new("no current tag found"))?;
            let attr = current_tag
                .attributes_mut()
                .last_mut()
                .ok_or_else(|| HtmlParseError::new("no attributes on current tag"))?;
            attr.value.reserve(1 + tail.len());
            attr.value.push(head);
            attr.value.extend(tail.iter());
        }
    }
    Ok(())
}
/// Attribute value (single-quoted) state.
///
/// * `'` switches to `AfterAttributeValueQuoted`.
/// * `&` stores `AttributeValueSingleQuoted` as the return state and switches to
///   `CharacterReference`.
/// * NUL reports `UnexpectedNullCharacter` and appends `REPLACEMENT_CHARACTER` to
///   the value.
/// * End of input reports `EofInTag` and emits `EndOfFile`.
/// * Any other character is appended to the newest attribute's value together with
///   whatever `consume_while` collects using a predicate that rejects `'`, `&`
///   and NUL.
///
/// # Errors
/// Propagates errors from the tokenizer helpers; the bulk path fails with
/// "no current tag found" or "no attributes on current tag" when there is no
/// attribute to extend.
pub(super) fn attribute_value_single_quoted_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInTag)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some('\'') => self.state = TokenizerState::AfterAttributeValueQuoted,
        Some('&') => {
            self.return_state = Some(TokenizerState::AttributeValueSingleQuoted);
            self.state = TokenizerState::CharacterReference;
        }
        Some(chars::NULL) => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            self.push_char_to_attribute_value(chars::REPLACEMENT_CHARACTER)?;
        }
        Some(head) => {
            let tail = self
                .input_stream
                .consume_while(|ch| !matches!(*ch, '\'' | '&' | '\0'));
            let current_tag = self
                .tag_token
                .as_mut()
                .ok_or_else(|| HtmlParseError::new("no current tag found"))?;
            let attr = current_tag
                .attributes_mut()
                .last_mut()
                .ok_or_else(|| HtmlParseError::new("no attributes on current tag"))?;
            attr.value.reserve(1 + tail.len());
            attr.value.push(head);
            attr.value.extend(tail.iter());
        }
    }
    Ok(())
}
/// Attribute value (unquoted) state.
///
/// * Tab, line feed, form feed or space is recorded in `attribute_prefix_buffer`
///   and switches to `BeforeAttributeName`.
/// * `&` stores `AttributeValueUnquoted` as the return state and switches to
///   `CharacterReference`.
/// * `>` switches to `Data` and emits the current tag token.
/// * NUL reports `UnexpectedNullCharacter` and appends `REPLACEMENT_CHARACTER` to
///   the value.
/// * `"`, `'`, `<`, `=` or `` ` `` reports
///   `UnexpectedCharacterInUnquotedAttributeValue` and is appended to the value.
/// * End of input reports `EofInTag` and emits `EndOfFile`.
/// * Any other character is appended to the newest attribute's value, followed by
///   further characters read through `current()`/`next()` until one of the
///   characters above shows up.
///
/// # Errors
/// Propagates errors from the tokenizer helpers; the bulk path fails with
/// "no current tag found" or "no attributes on current tag" when there is no
/// attribute to extend.
pub(super) fn attribute_value_unquoted_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInTag)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some(
            blank @ (chars::CHARACTER_TABULATION
            | chars::LINE_FEED
            | chars::FORM_FEED
            | chars::SPACE),
        ) => {
            self.attribute_prefix_buffer.push(blank);
            self.state = TokenizerState::BeforeAttributeName;
        }
        Some('&') => {
            self.return_state = Some(TokenizerState::AttributeValueUnquoted);
            self.state = TokenizerState::CharacterReference;
        }
        Some('>') => {
            self.state = TokenizerState::Data;
            self.emit_current_tag_token()?;
        }
        Some(chars::NULL) => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            self.push_char_to_attribute_value(chars::REPLACEMENT_CHARACTER)?;
        }
        Some(suspicious @ ('"' | '\'' | '<' | '=' | '`')) => {
            self.handle_error(TokenizerError::UnexpectedCharacterInUnquotedAttributeValue)?;
            self.push_char_to_attribute_value(suspicious)?;
        }
        Some(head) => {
            let current_tag = self
                .tag_token
                .as_mut()
                .ok_or_else(|| HtmlParseError::new("no current tag found"))?;
            let attr = current_tag
                .attributes_mut()
                .last_mut()
                .ok_or_else(|| HtmlParseError::new("no attributes on current tag"))?;
            attr.value.push(head);
            // NOTE(review): this loop relies on `current()` yielding the character
            // that `next()` would consume next — confirm against the input stream.
            while let Some(&upcoming) = self.input_stream.current() {
                let ends_run = matches!(
                    upcoming,
                    '\t' | '\n'
                        | '\x0C'
                        | ' '
                        | '&'
                        | '>'
                        | '\0'
                        | '"'
                        | '\''
                        | '<'
                        | '='
                        | '`'
                );
                if ends_run {
                    break;
                }
                attr.value.push(upcoming);
                self.input_stream.next();
            }
        }
    }
    Ok(())
}
/// After attribute value (quoted) state.
///
/// * Tab, line feed, form feed or space is recorded in `attribute_prefix_buffer`
///   and switches to `BeforeAttributeName`.
/// * `/` switches to `SelfClosingStartTag`.
/// * `>` switches to `Data` and emits the current tag token.
/// * End of input reports `EofInTag` and emits `EndOfFile`.
/// * Anything else reports `MissingWhitespaceBetweenAttributes` and reconsumes in
///   `BeforeAttributeName`.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn after_attribute_value_quoted_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInTag)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some(
            blank @ (chars::CHARACTER_TABULATION
            | chars::LINE_FEED
            | chars::FORM_FEED
            | chars::SPACE),
        ) => {
            self.attribute_prefix_buffer.push(blank);
            self.state = TokenizerState::BeforeAttributeName;
        }
        Some('/') => self.state = TokenizerState::SelfClosingStartTag,
        Some('>') => {
            self.state = TokenizerState::Data;
            self.emit_current_tag_token()?;
        }
        Some(_) => {
            self.handle_error(TokenizerError::MissingWhitespaceBetweenAttributes)?;
            self.reconsume_in_state(TokenizerState::BeforeAttributeName)?;
        }
    }
    Ok(())
}
/// Self-closing start tag state.
///
/// * `>` sets the current tag token's self-closing flag, switches to `Data` and
///   emits the tag token.
/// * End of input reports `EofInTag` and emits `EndOfFile`.
/// * Anything else reports `UnexpectedSolidusInTag` and reconsumes in
///   `BeforeAttributeName`.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn self_closing_start_tag_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInTag)?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some('>') => {
            let current_tag = self.current_tag_token_mut()?;
            *current_tag.self_closing_mut() = true;
            self.state = TokenizerState::Data;
            self.emit_current_tag_token()?;
        }
        Some(_) => {
            self.handle_error(TokenizerError::UnexpectedSolidusInTag)?;
            self.reconsume_in_state(TokenizerState::BeforeAttributeName)?;
        }
    }
    Ok(())
}
/// Bogus comment state.
///
/// * `>` switches to `Data` and emits the current comment token.
/// * End of input emits the current comment token followed by `EndOfFile`.
/// * NUL reports `UnexpectedNullCharacter` and appends `REPLACEMENT_CHARACTER` to
///   the comment data.
/// * Any other character is appended to the comment data.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn bogus_comment_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.emit_current_comment_token()?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some('>') => {
            self.state = TokenizerState::Data;
            self.emit_current_comment_token()?;
        }
        Some(other) => {
            // NUL is reported and replaced on the way into the comment data.
            let appended = if other == chars::NULL {
                self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
                chars::REPLACEMENT_CHARACTER
            } else {
                other
            };
            self.current_comment_token_mut()?.data.push(appended);
        }
    }
    Ok(())
}
/// Markup declaration open state: inspects the upcoming characters via `peek_add`
/// and commits with `next_add` only when a known opener is found.
///
/// * `--`: calls `next_add(2)`, starts an empty comment token and switches to
///   `CommentStart`.
/// * `DOCTYPE` (ASCII case-insensitive): calls `next_add(7)` and switches to
///   `DOCTYPE`.
/// * `[CDATA[` (exact match): calls `next_add(7)`; when the parser's adjusted
///   current node is an element whose namespace (defaulting to `HTML_NAMESPACE`)
///   differs from `HTML_NAMESPACE`, switches to `CDATASection`; otherwise reports
///   `CdataInHtmlContent`, starts a comment token holding `[CDATA[` and switches
///   to `BogusComment`.
/// * Otherwise reports `IncorrectlyOpenedComment`, starts an empty comment token
///   and switches to `BogusComment` without calling `next_add`.
///
/// # Errors
/// Propagates errors from `handle_error`.
pub(super) fn markup_declaration_open_state(&mut self) -> Result<(), HtmlParseError> {
    if (0..2).all(|offset| self.input_stream.peek_add(offset) == Some(&'-')) {
        self.input_stream.next_add(2);
        self.comment_token = Some(CommentToken::new(String::new()));
        self.state = TokenizerState::CommentStart;
    } else if "DOCTYPE".chars().enumerate().all(|(offset, expected)| {
        self.input_stream
            .peek_add(offset)
            .is_some_and(|found| found.eq_ignore_ascii_case(&expected))
    }) {
        self.input_stream.next_add(7);
        self.state = TokenizerState::DOCTYPE;
    } else if "[CDATA["
        .chars()
        .enumerate()
        .all(|(offset, expected)| self.input_stream.peek_add(offset) == Some(&expected))
    {
        self.input_stream.next_add(7);
        let in_foreign_element = match self.parser.adjusted_current_node() {
            Some(XpathItemTreeNode::ElementNode(element)) => {
                element.namespace.as_deref().unwrap_or(HTML_NAMESPACE) != HTML_NAMESPACE
            }
            _ => false,
        };
        if in_foreign_element {
            self.state = TokenizerState::CDATASection;
        } else {
            self.handle_error(TokenizerError::CdataInHtmlContent)?;
            self.comment_token = Some(CommentToken::new(String::from("[CDATA[")));
            self.state = TokenizerState::BogusComment;
        }
    } else {
        self.handle_error(TokenizerError::IncorrectlyOpenedComment)?;
        self.comment_token = Some(CommentToken::new(String::new()));
        self.state = TokenizerState::BogusComment;
    }
    Ok(())
}
/// Comment start state.
///
/// * `-` switches to `CommentStartDash`.
/// * `>` reports `AbruptClosingOfEmptyComment`, switches to `Data` and emits the
///   current comment token.
/// * Anything else (including end of input) reconsumes in `Comment`.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn comment_start_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        Some('-') => self.state = TokenizerState::CommentStartDash,
        Some('>') => {
            self.handle_error(TokenizerError::AbruptClosingOfEmptyComment)?;
            self.state = TokenizerState::Data;
            self.emit_current_comment_token()?;
        }
        _ => self.reconsume_in_state(TokenizerState::Comment)?,
    }
    Ok(())
}
/// Comment start dash state.
///
/// * `-` switches to `CommentEnd`.
/// * `>` reports `AbruptClosingOfEmptyComment`, switches to `Data` and emits the
///   current comment token.
/// * End of input reports `EofInComment`, emits the current comment token and then
///   `EndOfFile`.
/// * Anything else appends `-` to the comment data and reconsumes in `Comment`.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn comment_start_dash_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInComment)?;
            self.emit_current_comment_token()?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some('-') => self.state = TokenizerState::CommentEnd,
        Some('>') => {
            self.handle_error(TokenizerError::AbruptClosingOfEmptyComment)?;
            self.state = TokenizerState::Data;
            self.emit_current_comment_token()?;
        }
        Some(_) => {
            let comment = self.current_comment_token_mut()?;
            comment.data.push('-');
            self.reconsume_in_state(TokenizerState::Comment)?;
        }
    }
    Ok(())
}
/// Comment state.
///
/// * `<` is appended to the comment data and switches to `CommentLessThanSign`.
/// * `-` switches to `CommentEndDash`.
/// * NUL reports `UnexpectedNullCharacter` and appends `REPLACEMENT_CHARACTER`.
/// * End of input reports `EofInComment`, emits the current comment token and then
///   `EndOfFile`.
/// * Any other character is appended to the comment data.
///
/// # Errors
/// Propagates errors from the tokenizer helpers.
pub(super) fn comment_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    match next {
        None => {
            self.handle_error(TokenizerError::EofInComment)?;
            self.emit_current_comment_token()?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some('-') => self.state = TokenizerState::CommentEndDash,
        Some('<') => {
            let comment = self.current_comment_token_mut()?;
            comment.data.push('<');
            self.state = TokenizerState::CommentLessThanSign;
        }
        Some(other) => {
            // NUL is reported and replaced on the way into the comment data.
            let appended = if other == chars::NULL {
                self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
                chars::REPLACEMENT_CHARACTER
            } else {
                other
            };
            self.current_comment_token_mut()?.data.push(appended);
        }
    }
    Ok(())
}
/// Handles the `CommentLessThanSign` tokenizer state.
///
/// `!` and `<` are both appended to the comment data; `!` additionally
/// switches to `CommentLessThanSignBang`, while `<` stays in this state.
/// Anything else (including end of input) is reconsumed in `Comment`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn comment_less_than_sign_state(&mut self) -> Result<(), HtmlParseError> {
    match self.input_stream.next().copied() {
        Some(current @ ('!' | '<')) => {
            self.current_comment_token_mut()?.data.push(current);
            if current == '!' {
                self.state = TokenizerState::CommentLessThanSignBang;
            }
        }
        _ => self.reconsume_in_state(TokenizerState::Comment)?,
    }
    Ok(())
}
/// Handles the `CommentLessThanSignBang` tokenizer state.
///
/// `-` switches to `CommentLessThanSignBangDash`; anything else (including
/// end of input) is reconsumed in `Comment`.
///
/// # Errors
/// Propagates any error returned by `reconsume_in_state`.
pub(super) fn comment_less_than_sign_bang_state(&mut self) -> Result<(), HtmlParseError> {
    if matches!(self.input_stream.next(), Some('-')) {
        self.state = TokenizerState::CommentLessThanSignBangDash;
    } else {
        self.reconsume_in_state(TokenizerState::Comment)?;
    }
    Ok(())
}
/// Handles the `CommentLessThanSignBangDash` tokenizer state.
///
/// `-` switches to `CommentLessThanSignBangDashDash`; anything else
/// (including end of input) is reconsumed in `CommentEndDash`.
///
/// # Errors
/// Propagates any error returned by `reconsume_in_state`.
pub(super) fn comment_less_than_sign_bang_dash_state(&mut self) -> Result<(), HtmlParseError> {
    if matches!(self.input_stream.next(), Some('-')) {
        self.state = TokenizerState::CommentLessThanSignBangDashDash;
    } else {
        self.reconsume_in_state(TokenizerState::CommentEndDash)?;
    }
    Ok(())
}
/// Handles the `CommentLessThanSignBangDashDash` tokenizer state.
///
/// Every input is reconsumed in `CommentEnd`. Unless the input was `>` or
/// end of input, a `NestedComment` error is reported first.
///
/// # Errors
/// Propagates any error returned by the error handler or
/// `reconsume_in_state`.
pub(super) fn comment_less_than_sign_bang_dash_dash_state(
    &mut self,
) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    let closes_cleanly = matches!(next, Some('>') | None);
    if !closes_cleanly {
        self.handle_error(TokenizerError::NestedComment)?;
    }
    self.reconsume_in_state(TokenizerState::CommentEnd)?;
    Ok(())
}
/// Handles the `CommentEndDash` tokenizer state.
///
/// * `-` switches to `CommentEnd`.
/// * End of input reports `EofInComment`, emits the comment token and then
///   an end-of-file token.
/// * Any other character appends `-` to the comment data and is reconsumed
///   in `Comment`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn comment_end_dash_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    if next.is_none() {
        self.handle_error(TokenizerError::EofInComment)?;
        self.emit_current_comment_token()?;
        self.emit(HtmlToken::EndOfFile)?;
    } else if next == Some('-') {
        self.state = TokenizerState::CommentEnd;
    } else {
        self.current_comment_token_mut()?.data.push('-');
        self.reconsume_in_state(TokenizerState::Comment)?;
    }
    Ok(())
}
/// Handles the `CommentEnd` tokenizer state.
///
/// * `>` switches to `Data` and emits the current comment token.
/// * `!` switches to `CommentEndBang`.
/// * `-` appends `-` to the comment data and stays in this state.
/// * End of input reports `EofInComment`, emits the comment token and then
///   an end-of-file token.
/// * Any other character appends `--` to the comment data and is
///   reconsumed in `Comment`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn comment_end_state(&mut self) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInComment)?;
        self.emit_current_comment_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        '>' => {
            self.state = TokenizerState::Data;
            self.emit_current_comment_token()?;
        }
        '!' => self.state = TokenizerState::CommentEndBang,
        '-' => self.current_comment_token_mut()?.data.push('-'),
        _ => {
            self.current_comment_token_mut()?.data.push_str("--");
            self.reconsume_in_state(TokenizerState::Comment)?;
        }
    }
    Ok(())
}
/// Handles the `CommentEndBang` tokenizer state.
///
/// * `-` appends `--!` to the comment data and switches to
///   `CommentEndDash`.
/// * `>` reports `IncorrectlyClosedComment`, switches to `Data` and emits
///   the current comment token.
/// * End of input reports `EofInComment`, emits the comment token and then
///   an end-of-file token.
/// * Any other character appends `--!` to the comment data and is
///   reconsumed in `Comment`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn comment_end_bang_state(&mut self) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInComment)?;
        self.emit_current_comment_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    if current == '>' {
        self.handle_error(TokenizerError::IncorrectlyClosedComment)?;
        self.state = TokenizerState::Data;
        self.emit_current_comment_token()?;
        return Ok(());
    }
    // Both remaining paths keep the literal "--!" as comment text.
    self.current_comment_token_mut()?.data.push_str("--!");
    if current == '-' {
        self.state = TokenizerState::CommentEndDash;
    } else {
        self.reconsume_in_state(TokenizerState::Comment)?;
    }
    Ok(())
}
/// Handles the `DOCTYPE` tokenizer state.
///
/// * Tab, line feed, form feed or space switches to `BeforeDOCTYPEName`.
/// * `>` is reconsumed in `BeforeDOCTYPEName` without reporting an error.
/// * End of input reports `EofInDoctype`, installs a new empty-named
///   DOCTYPE token with `force_quirks` set, emits it via
///   `emit_current_token`, then emits an end-of-file token.
/// * Any other character reports `MissingWhitespaceBeforeDoctypeName` and
///   is reconsumed in `BeforeDOCTYPEName`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn doctype_state(&mut self) -> Result<(), HtmlParseError> {
    match self.input_stream.next().copied() {
        None => {
            self.handle_error(TokenizerError::EofInDoctype)?;
            let mut quirky = DoctypeToken::new(String::new());
            quirky.force_quirks = true;
            self.doctype_token = Some(quirky);
            self.emit_current_token()?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some(
            chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE,
        ) => {
            self.state = TokenizerState::BeforeDOCTYPEName;
        }
        Some(other) => {
            if other != '>' {
                self.handle_error(TokenizerError::MissingWhitespaceBeforeDoctypeName)?;
            }
            self.reconsume_in_state(TokenizerState::BeforeDOCTYPEName)?;
        }
    }
    Ok(())
}
/// Handles the `BeforeDOCTYPEName` tokenizer state.
///
/// * Tab, line feed, form feed and space are skipped.
/// * `>` reports `MissingDoctypeName`, installs an empty-named DOCTYPE
///   token with `force_quirks` set, emits it via `emit_current_token` and
///   switches to `Data`.
/// * End of input reports `EofInDoctype`, installs the same kind of token,
///   emits it and then emits an end-of-file token.
/// * Any other character starts a new DOCTYPE token and switches to
///   `DOCTYPEName`. NULL reports `UnexpectedNullCharacter` and starts the
///   name with the replacement character; ASCII uppercase letters are
///   lowercased; everything else is used as-is.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn before_doctype_name(&mut self) -> Result<(), HtmlParseError> {
    match self.input_stream.next().copied() {
        Some(
            chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE,
        ) => {}
        Some('>') => {
            self.handle_error(TokenizerError::MissingDoctypeName)?;
            let mut nameless = DoctypeToken::new(String::new());
            nameless.force_quirks = true;
            self.doctype_token = Some(nameless);
            self.emit_current_token()?;
            self.state = TokenizerState::Data;
        }
        None => {
            self.handle_error(TokenizerError::EofInDoctype)?;
            let mut nameless = DoctypeToken::new(String::new());
            nameless.force_quirks = true;
            self.doctype_token = Some(nameless);
            self.emit_current_token()?;
            self.emit(HtmlToken::EndOfFile)?;
        }
        Some(current) => {
            let first_name_char = if current == chars::NULL {
                self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
                chars::REPLACEMENT_CHARACTER
            } else {
                // Only ASCII uppercase letters are altered by this call.
                current.to_ascii_lowercase()
            };
            self.doctype_token = Some(DoctypeToken::new(String::from(first_name_char)));
            self.state = TokenizerState::DOCTYPEName;
        }
    }
    Ok(())
}
/// Handles the `DOCTYPEName` tokenizer state, building the DOCTYPE name.
///
/// * Tab, line feed, form feed or space switches to `AfterDOCTYPEName`.
/// * `>` emits the current DOCTYPE token and switches to `Data`.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * NULL reports `UnexpectedNullCharacter` and appends the replacement
///   character to the name.
/// * Any other character is appended to the name, with ASCII uppercase
///   letters lowercased.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn doctype_name_state(&mut self) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE => {
            self.state = TokenizerState::AfterDOCTYPEName;
        }
        '>' => {
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        chars::NULL => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            self.current_doctype_token_mut()?
                .name
                .push(chars::REPLACEMENT_CHARACTER);
        }
        other => {
            // Only ASCII uppercase letters are altered by this call.
            let normalized = other.to_ascii_lowercase();
            self.current_doctype_token_mut()?.name.push(normalized);
        }
    }
    Ok(())
}
/// Handles the `AfterDOCTYPEName` tokenizer state.
///
/// * Tab, line feed, form feed and space are skipped.
/// * `>` emits the current DOCTYPE token and switches to `Data`.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Otherwise the upcoming six characters are compared ASCII
///   case-insensitively against `PUBLIC` and then `SYSTEM`; a match skips
///   those characters and switches to the corresponding keyword state. If
///   neither matches, `InvalidCharacterSequenceAfterDoctypeName` is
///   reported, `force_quirks` is set and the input is reconsumed in
///   `BogusDOCTYPE`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn after_doctype_name_state(&mut self) -> Result<(), HtmlParseError> {
match self.input_stream.next() {
Some(&chars::CHARACTER_TABULATION)
| Some(&chars::LINE_FEED)
| Some(&chars::FORM_FEED)
| Some(&chars::SPACE) => {
}
Some('>') => {
self.emit_current_doctype_token()?;
self.state = TokenizerState::Data;
}
None => {
self.handle_error(TokenizerError::EofInDoctype)?;
self.current_doctype_token_mut()?.force_quirks = true;
self.emit_current_doctype_token()?;
self.emit(HtmlToken::EndOfFile)?;
}
Some(_) => {
// Presumably steps the stream back so the lookahead below starts at the
// character that was just read — confirm against `prev`'s definition.
self.input_stream.prev();
let public_target = ['P', 'U', 'B', 'L', 'I', 'C'];
// `peek_add(i)` appears to look `i` positions ahead without consuming.
if public_target.iter().enumerate().all(|(i, target)| {
self.input_stream
.peek_add(i)
.is_some_and(|c| c.eq_ignore_ascii_case(target))
}) {
// Skip the six keyword characters that were just matched.
self.input_stream.next_add(6);
self.state = TokenizerState::AfterDOCTYPEPublicKeyword;
return Ok(());
}
let system_target = ['S', 'Y', 'S', 'T', 'E', 'M'];
if system_target.iter().enumerate().all(|(i, target)| {
self.input_stream
.peek_add(i)
.is_some_and(|c| c.eq_ignore_ascii_case(target))
}) {
// Skip the six keyword characters that were just matched.
self.input_stream.next_add(6);
self.state = TokenizerState::AfterDOCTYPESystemKeyword;
return Ok(());
}
self.handle_error(TokenizerError::InvalidCharacterSequenceAfterDoctypeName)?;
self.current_doctype_token_mut()?.force_quirks = true;
// NOTE(review): `prev()` was already called above; if `reconsume_in_state`
// also rewinds the stream, the preceding character is revisited in
// `BogusDOCTYPE` — confirm this is intended.
self.reconsume_in_state(TokenizerState::BogusDOCTYPE)?;
}
}
Ok(())
}
/// Handles the `AfterDOCTYPEPublicKeyword` tokenizer state.
///
/// * Tab, line feed, form feed or space switches to
///   `BeforeDOCTYPEPublicIdentifier`.
/// * `"` or `'` reports `MissingWhitespaceAfterDoctypePublicKeyword`,
///   resets the public identifier to an empty string and switches to the
///   matching quoted public identifier state.
/// * `>` reports `MissingDoctypePublicIdentifier`, sets `force_quirks`,
///   emits the DOCTYPE token and switches to `Data`.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Anything else reports `MissingQuoteBeforeDoctypePublicIdentifier`,
///   sets `force_quirks` and is reconsumed in `BogusDOCTYPE`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn after_doctype_public_keyword_state(&mut self) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE => {
            self.state = TokenizerState::BeforeDOCTYPEPublicIdentifier;
        }
        quote @ ('"' | '\'') => {
            self.handle_error(TokenizerError::MissingWhitespaceAfterDoctypePublicKeyword)?;
            self.current_doctype_token_mut()?.public_identifier = Some(String::new());
            self.state = if quote == '"' {
                TokenizerState::DOCTYPEPublicIdentifierDoubleQuoted
            } else {
                TokenizerState::DOCTYPEPublicIdentifierSingleQuoted
            };
        }
        '>' => {
            self.handle_error(TokenizerError::MissingDoctypePublicIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        _ => {
            self.handle_error(TokenizerError::MissingQuoteBeforeDoctypePublicIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.reconsume_in_state(TokenizerState::BogusDOCTYPE)?;
        }
    }
    Ok(())
}
/// Handles the `BeforeDOCTYPEPublicIdentifier` tokenizer state.
///
/// * Tab, line feed, form feed and space are skipped.
/// * `"` or `'` resets the public identifier to an empty string and
///   switches to the matching quoted public identifier state.
/// * `>` reports `MissingDoctypePublicIdentifier`, sets `force_quirks`,
///   emits the DOCTYPE token and switches to `Data`.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Anything else reports `MissingQuoteBeforeDoctypePublicIdentifier`,
///   sets `force_quirks` and is reconsumed in `BogusDOCTYPE`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn before_doctype_public_identifier_state(&mut self) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE => {}
        quote @ ('"' | '\'') => {
            self.current_doctype_token_mut()?.public_identifier = Some(String::new());
            self.state = if quote == '"' {
                TokenizerState::DOCTYPEPublicIdentifierDoubleQuoted
            } else {
                TokenizerState::DOCTYPEPublicIdentifierSingleQuoted
            };
        }
        '>' => {
            self.handle_error(TokenizerError::MissingDoctypePublicIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        _ => {
            self.handle_error(TokenizerError::MissingQuoteBeforeDoctypePublicIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.reconsume_in_state(TokenizerState::BogusDOCTYPE)?;
        }
    }
    Ok(())
}
/// Handles the `DOCTYPEPublicIdentifierDoubleQuoted` tokenizer state.
///
/// * `"` switches to `AfterDOCTYPEPublicIdentifier`.
/// * NULL reports `UnexpectedNullCharacter` and appends the replacement
///   character to the public identifier.
/// * `>` reports `AbruptDoctypePublicIdentifier`, sets `force_quirks`,
///   emits the DOCTYPE token and switches to `Data`.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Any other character is appended to the public identifier, which is
///   created empty first if it is currently `None`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn doctype_public_identifier_double_quoted_state(
    &mut self,
) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        '"' => self.state = TokenizerState::AfterDOCTYPEPublicIdentifier,
        '>' => {
            self.handle_error(TokenizerError::AbruptDoctypePublicIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        chars::NULL => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            let token = self.current_doctype_token_mut()?;
            token
                .public_identifier
                .get_or_insert_with(String::new)
                .push(chars::REPLACEMENT_CHARACTER);
        }
        other => {
            let token = self.current_doctype_token_mut()?;
            token
                .public_identifier
                .get_or_insert_with(String::new)
                .push(other);
        }
    }
    Ok(())
}
/// Handles the `DOCTYPEPublicIdentifierSingleQuoted` tokenizer state.
///
/// * `'` switches to `AfterDOCTYPEPublicIdentifier`.
/// * NULL reports `UnexpectedNullCharacter` and appends the replacement
///   character to the public identifier.
/// * `>` reports `AbruptDoctypePublicIdentifier`, sets `force_quirks`,
///   emits the DOCTYPE token and switches to `Data`.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Any other character is appended to the public identifier, which is
///   created empty first if it is currently `None`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn doctype_public_identifier_single_quoted_state(
    &mut self,
) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        '\'' => self.state = TokenizerState::AfterDOCTYPEPublicIdentifier,
        '>' => {
            self.handle_error(TokenizerError::AbruptDoctypePublicIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        chars::NULL => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            let token = self.current_doctype_token_mut()?;
            token
                .public_identifier
                .get_or_insert_with(String::new)
                .push(chars::REPLACEMENT_CHARACTER);
        }
        other => {
            let token = self.current_doctype_token_mut()?;
            token
                .public_identifier
                .get_or_insert_with(String::new)
                .push(other);
        }
    }
    Ok(())
}
/// Handles the `AfterDOCTYPEPublicIdentifier` tokenizer state.
///
/// * Tab, line feed, form feed or space switches to
///   `BetweenDOCTYPEPublicAndSystemIdentifiers`.
/// * `>` emits the DOCTYPE token and switches to `Data`.
/// * `"` or `'` reports
///   `MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers`, resets
///   the system identifier to an empty string and switches to the matching
///   quoted system identifier state.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Anything else reports `MissingQuoteBeforeDoctypeSystemIdentifier`,
///   sets `force_quirks` and is reconsumed in `BogusDOCTYPE`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn after_doctype_public_identifier_state(&mut self) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE => {
            self.state = TokenizerState::BetweenDOCTYPEPublicAndSystemIdentifiers;
        }
        '>' => {
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        quote @ ('"' | '\'') => {
            self.handle_error(
                TokenizerError::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
            )?;
            self.current_doctype_token_mut()?.system_identifier = Some(String::new());
            self.state = if quote == '"' {
                TokenizerState::DOCTYPESystemIdentifierDoubleQuoted
            } else {
                TokenizerState::DOCTYPESystemIdentifierSingleQuoted
            };
        }
        _ => {
            self.handle_error(TokenizerError::MissingQuoteBeforeDoctypeSystemIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.reconsume_in_state(TokenizerState::BogusDOCTYPE)?;
        }
    }
    Ok(())
}
/// Handles the `BetweenDOCTYPEPublicAndSystemIdentifiers` tokenizer state.
///
/// * Tab, line feed, form feed and space are skipped.
/// * `>` emits the DOCTYPE token and switches to `Data`.
/// * `"` or `'` resets the system identifier to an empty string and
///   switches to the matching quoted system identifier state.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Anything else reports `MissingQuoteBeforeDoctypeSystemIdentifier`,
///   sets `force_quirks` and is reconsumed in `BogusDOCTYPE`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn between_doctype_public_and_system_identifiers_state(
    &mut self,
) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE => {}
        '>' => {
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        quote @ ('"' | '\'') => {
            self.current_doctype_token_mut()?.system_identifier = Some(String::new());
            self.state = if quote == '"' {
                TokenizerState::DOCTYPESystemIdentifierDoubleQuoted
            } else {
                TokenizerState::DOCTYPESystemIdentifierSingleQuoted
            };
        }
        _ => {
            self.handle_error(TokenizerError::MissingQuoteBeforeDoctypeSystemIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.reconsume_in_state(TokenizerState::BogusDOCTYPE)?;
        }
    }
    Ok(())
}
/// Handles the `AfterDOCTYPESystemKeyword` tokenizer state.
///
/// * Tab, line feed, form feed or space switches to
///   `BeforeDOCTYPESystemIdentifier`.
/// * `"` or `'` reports `MissingWhitespaceAfterDoctypeSystemKeyword`,
///   resets the system identifier to an empty string and switches to the
///   matching quoted system identifier state.
/// * `>` reports `MissingDoctypeSystemIdentifier`, sets `force_quirks`,
///   emits the DOCTYPE token and switches to `Data`.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Anything else reports `MissingQuoteBeforeDoctypeSystemIdentifier`,
///   sets `force_quirks` and is reconsumed in `BogusDOCTYPE`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn after_doctype_system_keyword_state(&mut self) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE => {
            self.state = TokenizerState::BeforeDOCTYPESystemIdentifier;
        }
        quote @ ('"' | '\'') => {
            self.handle_error(TokenizerError::MissingWhitespaceAfterDoctypeSystemKeyword)?;
            self.current_doctype_token_mut()?.system_identifier = Some(String::new());
            self.state = if quote == '"' {
                TokenizerState::DOCTYPESystemIdentifierDoubleQuoted
            } else {
                TokenizerState::DOCTYPESystemIdentifierSingleQuoted
            };
        }
        '>' => {
            self.handle_error(TokenizerError::MissingDoctypeSystemIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        _ => {
            self.handle_error(TokenizerError::MissingQuoteBeforeDoctypeSystemIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.reconsume_in_state(TokenizerState::BogusDOCTYPE)?;
        }
    }
    Ok(())
}
/// Handles the `BeforeDOCTYPESystemIdentifier` tokenizer state.
///
/// * Tab, line feed, form feed and space are skipped.
/// * `"` or `'` resets the system identifier to an empty string and
///   switches to the matching quoted system identifier state.
/// * `>` reports `MissingDoctypeSystemIdentifier`, sets `force_quirks`,
///   emits the DOCTYPE token and switches to `Data`.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Anything else reports `MissingQuoteBeforeDoctypeSystemIdentifier`,
///   sets `force_quirks` and is reconsumed in `BogusDOCTYPE`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn before_doctype_system_identifier_state(&mut self) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE => {}
        quote @ ('"' | '\'') => {
            self.current_doctype_token_mut()?.system_identifier = Some(String::new());
            self.state = if quote == '"' {
                TokenizerState::DOCTYPESystemIdentifierDoubleQuoted
            } else {
                TokenizerState::DOCTYPESystemIdentifierSingleQuoted
            };
        }
        '>' => {
            self.handle_error(TokenizerError::MissingDoctypeSystemIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        _ => {
            self.handle_error(TokenizerError::MissingQuoteBeforeDoctypeSystemIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.reconsume_in_state(TokenizerState::BogusDOCTYPE)?;
        }
    }
    Ok(())
}
/// Handles the `DOCTYPESystemIdentifierDoubleQuoted` tokenizer state.
///
/// * `"` switches to `AfterDOCTYPESystemIdentifier`.
/// * NULL reports `UnexpectedNullCharacter` and appends the replacement
///   character to the system identifier.
/// * `>` reports `AbruptDoctypeSystemIdentifier`, sets `force_quirks`,
///   emits the DOCTYPE token and switches to `Data`.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Any other character is appended to the system identifier, which is
///   created empty first if it is currently `None`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn doctype_system_identifier_double_quoted_state(
    &mut self,
) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        '"' => self.state = TokenizerState::AfterDOCTYPESystemIdentifier,
        '>' => {
            self.handle_error(TokenizerError::AbruptDoctypeSystemIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        chars::NULL => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            let token = self.current_doctype_token_mut()?;
            token
                .system_identifier
                .get_or_insert_with(String::new)
                .push(chars::REPLACEMENT_CHARACTER);
        }
        other => {
            let token = self.current_doctype_token_mut()?;
            token
                .system_identifier
                .get_or_insert_with(String::new)
                .push(other);
        }
    }
    Ok(())
}
/// Handles the `DOCTYPESystemIdentifierSingleQuoted` tokenizer state.
///
/// * `'` switches to `AfterDOCTYPESystemIdentifier`.
/// * NULL reports `UnexpectedNullCharacter` and appends the replacement
///   character to the system identifier.
/// * `>` reports `AbruptDoctypeSystemIdentifier`, sets `force_quirks`,
///   emits the DOCTYPE token and switches to `Data`.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Any other character is appended to the system identifier, which is
///   created empty first if it is currently `None`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn doctype_system_identifier_single_quoted_state(
    &mut self,
) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        '\'' => self.state = TokenizerState::AfterDOCTYPESystemIdentifier,
        '>' => {
            self.handle_error(TokenizerError::AbruptDoctypeSystemIdentifier)?;
            self.current_doctype_token_mut()?.force_quirks = true;
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        chars::NULL => {
            self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
            let token = self.current_doctype_token_mut()?;
            token
                .system_identifier
                .get_or_insert_with(String::new)
                .push(chars::REPLACEMENT_CHARACTER);
        }
        other => {
            let token = self.current_doctype_token_mut()?;
            token
                .system_identifier
                .get_or_insert_with(String::new)
                .push(other);
        }
    }
    Ok(())
}
/// Handles the `AfterDOCTYPESystemIdentifier` tokenizer state.
///
/// * Tab, line feed, form feed and space are skipped.
/// * `>` emits the DOCTYPE token and switches to `Data`.
/// * End of input reports `EofInDoctype`, sets `force_quirks`, emits the
///   DOCTYPE token and then an end-of-file token.
/// * Anything else reports
///   `UnexpectedCharacterAfterDoctypeSystemIdentifier` and is reconsumed in
///   `BogusDOCTYPE`; `force_quirks` is left untouched on this path.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn after_doctype_system_identifier_state(&mut self) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInDoctype)?;
        self.current_doctype_token_mut()?.force_quirks = true;
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    match current {
        chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE => {}
        '>' => {
            self.emit_current_doctype_token()?;
            self.state = TokenizerState::Data;
        }
        _ => {
            self.handle_error(TokenizerError::UnexpectedCharacterAfterDoctypeSystemIdentifier)?;
            self.reconsume_in_state(TokenizerState::BogusDOCTYPE)?;
        }
    }
    Ok(())
}
/// Handles the `BogusDOCTYPE` tokenizer state, discarding input until the
/// DOCTYPE ends.
///
/// * `>` emits the DOCTYPE token and switches to `Data`.
/// * NULL reports `UnexpectedNullCharacter` and is otherwise dropped.
/// * End of input emits the DOCTYPE token and then an end-of-file token.
/// * Every other character is dropped.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn bogus_doctype_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    if next.is_none() {
        self.emit_current_doctype_token()?;
        self.emit(HtmlToken::EndOfFile)?;
    } else if next == Some('>') {
        self.emit_current_doctype_token()?;
        self.state = TokenizerState::Data;
    } else if next == Some(chars::NULL) {
        self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
    }
    Ok(())
}
/// Handles the `CharacterReference` tokenizer state.
///
/// Resets the temporary buffer to a single `&`, then:
///
/// * an ASCII alphanumeric is reconsumed in `NamedCharacterReference`;
/// * `#` is appended to the temporary buffer and switches to
///   `NumericCharacterReference`;
/// * anything else (including end of input) flushes the buffered code
///   points and is reconsumed in the current return state.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn character_reference_state(&mut self) -> Result<(), HtmlParseError> {
    self.temporary_buffer.clear();
    self.temporary_buffer.push('&');
    match self.input_stream.next().copied() {
        Some(current) if current.is_ascii_alphanumeric() => {
            self.reconsume_in_state(TokenizerState::NamedCharacterReference)?;
        }
        Some('#') => {
            self.temporary_buffer.push('#');
            self.state = TokenizerState::NumericCharacterReference;
        }
        _ => {
            self.flush_code_points_consumed_as_character_reference()?;
            let return_state = self.current_return_state()?;
            self.reconsume_in_state(return_state)?;
        }
    }
    Ok(())
}
/// Handles the `NamedCharacterReference` tokenizer state.
///
/// Looks ahead in the input, without consuming, for the longest key of
/// `NAMED_CHARACTER_REFS` (looked up with a leading `&`). When a key
/// matches, its characters are consumed and the mapped replacement text is
/// flushed before returning to the current return state. When nothing
/// matches, the temporary buffer is flushed and the tokenizer switches to
/// `AmbiguousAmpersand`.
///
/// A match that does not end in `;` reports
/// `MissingSemicolonAfterCharacterReference`, unless the reference is inside
/// an attribute and the character read via `current()` is `=` or an ASCII
/// alphanumeric; in that case the raw matched text is flushed instead.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn named_character_reference_state(&mut self) -> Result<(), HtmlParseError> {
// Flushes whatever is in the temporary buffer and returns to the caller's
// state without substituting the reference.
fn historical_reasons(tokenizer: &mut Tokenizer) -> Result<(), HtmlParseError> {
tokenizer.flush_code_points_consumed_as_character_reference()?;
tokenizer.state = tokenizer.current_return_state()?;
Ok(())
}
// Candidate key, built up one peeked character at a time; the lookup table
// is queried with the leading '&' included.
let mut key_buf = String::with_capacity(NAMED_CHARACTER_REFS_MAX_LENGTH + 1);
key_buf.push('&');
// Length in bytes of the longest matching key so far (including the '&');
// 0 means nothing has matched.
let mut best_match_len: usize = 0;
let mut best_match_value: Option<&str> = None;
let mut peek_offset: usize = 0;
loop {
// Stop when the input runs out.
let next_char = match self.input_stream.peek_add(peek_offset) {
Some(&c) => c,
None => break,
};
key_buf.push(next_char);
if let Some(&value) = NAMED_CHARACTER_REFS.get(key_buf.as_str()) {
best_match_len = key_buf.len();
best_match_value = Some(value);
// The scan stops at the first key that matches and ends in ';'.
if next_char == ';' {
break;
}
}
peek_offset += 1;
// Stop once the candidate is longer than the configured maximum key length.
if key_buf.len() > NAMED_CHARACTER_REFS_MAX_LENGTH {
break;
}
}
if best_match_len > 0 {
// Drop the lookahead characters that ran past the best match.
key_buf.truncate(best_match_len);
let ends_with_semi = key_buf.as_bytes().last() == Some(&b';');
// Consume the matched name; the '&' counted in the length is not part of
// the lookahead, hence the `- 1`.
self.input_stream.next_add(best_match_len - 1);
if !ends_with_semi {
// Keep the raw matched text in the temporary buffer in case it has to be
// flushed verbatim below.
self.temporary_buffer.clear();
for code_point in key_buf.chars() {
self.temporary_buffer.push(code_point);
}
if self.charref_in_attribute() {
// NOTE(review): this check is meant for the character following the
// match; confirm `current()` yields that character rather than the last
// consumed one.
if let Some(c) = self.input_stream.current() {
match c {
'=' => {
historical_reasons(self)?;
return Ok(());
}
c if c.is_ascii_alphanumeric() => {
historical_reasons(self)?;
return Ok(());
}
_ => {}
}
}
}
self.handle_error(TokenizerError::MissingSemicolonAfterCharacterReference)?;
}
// Replace the buffered text with the characters the reference maps to.
self.temporary_buffer.clear();
// `best_match_value` is assigned together with `best_match_len`, so it is
// `Some` whenever `best_match_len > 0`.
let char_ref_characters = best_match_value.unwrap();
for code_point in char_ref_characters.chars() {
self.temporary_buffer.push(code_point);
}
self.flush_code_points_consumed_as_character_reference()?;
self.state = self.current_return_state()?;
} else {
// No key matched: nothing was consumed here, so flush the temporary buffer
// as it stands.
self.flush_code_points_consumed_as_character_reference()?;
self.state = TokenizerState::AmbiguousAmpersand;
}
Ok(())
}
/// Handles the `AmbiguousAmpersand` tokenizer state.
///
/// * An ASCII alphanumeric is appended to the current attribute value when
///   the character reference is inside an attribute, and emitted as a
///   character token otherwise.
/// * `;` reports `UnknownNamedCharacterReference` and is reconsumed in the
///   current return state.
/// * Anything else (including end of input) is reconsumed in the current
///   return state.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn ambiguous_ampersand_state(&mut self) -> Result<(), HtmlParseError> {
    match self.input_stream.next().copied() {
        Some(current) if current.is_ascii_alphanumeric() => {
            if self.charref_in_attribute() {
                self.current_attribute_mut()?.value.push(current);
            } else {
                self.emit(HtmlToken::Character(current))?;
            }
        }
        other => {
            if other == Some(';') {
                self.handle_error(TokenizerError::UnknownNamedCharacterReference)?;
            }
            let return_state = self.current_return_state()?;
            self.reconsume_in_state(return_state)?;
        }
    }
    Ok(())
}
/// Handles the `NumericCharacterReference` tokenizer state.
///
/// Resets the character reference code to zero. An `x` or `X` is appended
/// to the temporary buffer and switches to
/// `HexadecimalCharacterReferenceStart`; anything else (including end of
/// input) is reconsumed in `DecimalCharacterReferenceStart`.
///
/// # Errors
/// Propagates any error returned by `reconsume_in_state`.
pub(super) fn numeric_character_reference_state(&mut self) -> Result<(), HtmlParseError> {
    self.character_reference_code = 0;
    match self.input_stream.next().copied() {
        Some(marker)
            if marker == chars::LATIN_SMALL_LETTER_X
                || marker == chars::LATIN_CAPITAL_LETTER_X =>
        {
            self.temporary_buffer.push(marker);
            self.state = TokenizerState::HexadecimalCharacterReferenceStart;
        }
        _ => self.reconsume_in_state(TokenizerState::DecimalCharacterReferenceStart)?,
    }
    Ok(())
}
/// Handles the `DecimalCharacterReferenceStart` tokenizer state.
///
/// An ASCII digit is reconsumed in `DecimalCharacterReference`. Anything
/// else (including end of input) reports
/// `AbsenceOfDigitsInNumericCharacterReference`, flushes the buffered code
/// points and is reconsumed in the current return state.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn decimal_character_reference_start_state(&mut self) -> Result<(), HtmlParseError> {
    let starts_with_digit = self
        .input_stream
        .next()
        .is_some_and(|c| c.is_ascii_digit());
    if starts_with_digit {
        self.reconsume_in_state(TokenizerState::DecimalCharacterReference)?;
    } else {
        self.handle_error(TokenizerError::AbsenceOfDigitsInNumericCharacterReference)?;
        self.flush_code_points_consumed_as_character_reference()?;
        let return_state = self.current_return_state()?;
        self.reconsume_in_state(return_state)?;
    }
    Ok(())
}
/// Handles the `DecimalCharacterReference` tokenizer state.
///
/// * An ASCII digit multiplies the character reference code by ten and
///   adds the digit's value, saturating instead of overflowing.
/// * `;` switches to `NumericCharacterReferenceEnd`.
/// * Anything else (including end of input) reports
///   `MissingSemicolonAfterCharacterReference` and is reconsumed in
///   `NumericCharacterReferenceEnd`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls, or an
/// `HtmlParseError` if the digit cannot be converted.
pub(super) fn decimal_character_reference_state(&mut self) -> Result<(), HtmlParseError> {
    match self.input_stream.next().copied() {
        Some(current) if current.is_ascii_digit() => {
            let scaled = self.character_reference_code.saturating_mul(10);
            self.character_reference_code = scaled;
            let digit = current
                .to_digit(10)
                .ok_or_else(|| HtmlParseError::new("decimal character not a digit"))?;
            self.character_reference_code = scaled.saturating_add(digit);
        }
        Some(';') => self.state = TokenizerState::NumericCharacterReferenceEnd,
        _ => {
            self.handle_error(TokenizerError::MissingSemicolonAfterCharacterReference)?;
            self.reconsume_in_state(TokenizerState::NumericCharacterReferenceEnd)?;
        }
    }
    Ok(())
}
/// Handles the `NumericCharacterReferenceEnd` tokenizer state.
///
/// Validates the accumulated character reference code without reading any
/// input:
///
/// * zero, values above `0x10FFFF` and surrogates each report their own
///   error and are replaced by `0xFFFD`;
/// * noncharacters report `NoncharacterCharacterReference` and are kept;
/// * `0x0D` and controls that are not ASCII whitespace report
///   `ControlCharacterReference` and are remapped when
///   `NUMERIC_CHARACTER_REF_END_TABLE` has an entry for them.
///
/// The resulting code point replaces the temporary buffer's contents, is
/// flushed, and the tokenizer returns to the current return state.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn numeric_character_reference_end_state(&mut self) -> Result<(), HtmlParseError> {
    let code = self.character_reference_code;
    match code {
        0x00 => {
            self.handle_error(TokenizerError::NullCharacterReference)?;
            self.character_reference_code = 0xFFFD;
        }
        _ if code > 0x10FFFF => {
            self.handle_error(TokenizerError::CharacterReferenceOutsideUnicodeRange)?;
            self.character_reference_code = 0xFFFD;
        }
        _ if is_surrogate(code) => {
            self.handle_error(TokenizerError::SurrogateCharacterReference)?;
            self.character_reference_code = 0xFFFD;
        }
        _ if is_noncharacter(code) => {
            self.handle_error(TokenizerError::NoncharacterCharacterReference)?;
        }
        _ if code == 0x0D || (is_control(code) && !is_ascii_whitespace(code)) => {
            self.handle_error(TokenizerError::ControlCharacterReference)?;
            if let Some(&mapped) =
                NUMERIC_CHARACTER_REF_END_TABLE.get(&self.character_reference_code)
            {
                self.character_reference_code = mapped;
            }
        }
        _ => {}
    }
    let resolved = std::char::from_u32(self.character_reference_code)
        .unwrap_or(chars::REPLACEMENT_CHARACTER);
    self.temporary_buffer.clear();
    self.temporary_buffer.push(resolved);
    self.flush_code_points_consumed_as_character_reference()?;
    self.state = self.current_return_state()?;
    Ok(())
}
/// Handles the `PLAINTEXT` tokenizer state.
///
/// * NULL reports `UnexpectedNullCharacter` and emits the replacement
///   character.
/// * End of input emits an end-of-file token.
/// * Any other character is emitted together with the run of non-NULL
///   characters that follows it: as a single `Character` token when the
///   run is empty, otherwise as one `Characters` token.
///
/// # Errors
/// Propagates any error returned by the error handler or the emitter.
pub(super) fn plaintext_state(&mut self) -> Result<(), HtmlParseError> {
    let Some(first) = self.input_stream.next().copied() else {
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    if first == chars::NULL {
        self.handle_error(TokenizerError::UnexpectedNullCharacter)?;
        self.emit(HtmlToken::Character(chars::REPLACEMENT_CHARACTER))?;
        return Ok(());
    }
    let tail = self.input_stream.consume_while(|ch| *ch != '\0');
    let token = if tail.is_empty() {
        HtmlToken::Character(first)
    } else {
        let mut text = String::with_capacity(1 + tail.len());
        text.push(first);
        text.extend(tail.iter());
        HtmlToken::Characters(text)
    };
    self.emit(token)?;
    Ok(())
}
/// Handles the `RAWTEXTLessThanSign` tokenizer state.
///
/// `/` clears the temporary buffer and switches to `RAWTEXTEndTagOpen`.
/// Anything else (including end of input) emits `<` as a character token
/// and is reconsumed in `RAWTEXT`.
///
/// # Errors
/// Propagates any error returned by the emitter or `reconsume_in_state`.
pub(super) fn rawtext_less_than_sign_state(&mut self) -> Result<(), HtmlParseError> {
    if matches!(self.input_stream.next(), Some('/')) {
        self.temporary_buffer.clear();
        self.state = TokenizerState::RAWTEXTEndTagOpen;
    } else {
        self.emit(HtmlToken::Character('<'))?;
        self.reconsume_in_state(TokenizerState::RAWTEXT)?;
    }
    Ok(())
}
/// Handles the `RAWTEXTEndTagOpen` tokenizer state.
///
/// An ASCII letter installs a new end tag token with an empty name and is
/// reconsumed in `RAWTEXTEndTagName`. Anything else (including end of
/// input) emits `<` and `/` as character tokens and is reconsumed in
/// `RAWTEXT`.
///
/// # Errors
/// Propagates any error returned by the emitter or `reconsume_in_state`.
pub(super) fn rawtext_end_tag_open_state(&mut self) -> Result<(), HtmlParseError> {
    let starts_tag_name = self
        .input_stream
        .next()
        .is_some_and(|c| c.is_ascii_alphabetic());
    if starts_tag_name {
        self.tag_token = Some(TagTokenType::EndTag(TagToken::new(String::new())));
        self.reconsume_in_state(TokenizerState::RAWTEXTEndTagName)?;
    } else {
        self.emit(HtmlToken::Character('<'))?;
        self.emit(HtmlToken::Character('/'))?;
        self.reconsume_in_state(TokenizerState::RAWTEXT)?;
    }
    Ok(())
}
/// Handles the `RAWTEXTEndTagName` tokenizer state.
///
/// When the current end tag token is appropriate:
///
/// * tab, line feed, form feed or space switches to `BeforeAttributeName`;
/// * `/` switches to `SelfClosingStartTag`;
/// * `>` switches to `Data` and emits the current tag token.
///
/// An ASCII letter is appended to the tag name in lowercase and to the
/// temporary buffer in its original case. Every other situation, including
/// the three characters above with an inappropriate end tag, emits `<`,
/// `/` and the drained temporary buffer as character tokens and is
/// reconsumed in `RAWTEXT`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn rawtext_end_tag_name_state(&mut self) -> Result<(), HtmlParseError> {
    match self.input_stream.next().copied() {
        Some(
            chars::CHARACTER_TABULATION | chars::LINE_FEED | chars::FORM_FEED | chars::SPACE,
        ) if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::BeforeAttributeName;
        }
        Some('/') if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::SelfClosingStartTag;
        }
        Some('>') if self.is_current_end_tag_token_appropriate() => {
            self.state = TokenizerState::Data;
            self.emit_current_tag_token()?;
        }
        Some(letter) if letter.is_ascii_alphabetic() => {
            // Lowercasing leaves already-lowercase letters unchanged.
            let lowered = letter.to_ascii_lowercase();
            self.current_tag_token_mut()?.tag_name_mut().push(lowered);
            self.temporary_buffer.push(letter);
        }
        _ => {
            // Not a usable end tag: replay everything read so far as text.
            self.emit(HtmlToken::Character('<'))?;
            self.emit(HtmlToken::Character('/'))?;
            let buffered: Vec<char> = self.temporary_buffer.drain(..).collect();
            for ch in buffered {
                self.emit(HtmlToken::Character(ch))?;
            }
            self.reconsume_in_state(TokenizerState::RAWTEXT)?;
        }
    }
    Ok(())
}
/// Handles the `CDATASection` tokenizer state.
///
/// * `]` switches to `CDATASectionBracket`.
/// * End of input reports `EofInCdataSection` and emits an end-of-file
///   token.
/// * Any other character is emitted as a character token.
///
/// # Errors
/// Propagates any error returned by the error handler or the emitter.
pub(super) fn cdata_section_state(&mut self) -> Result<(), HtmlParseError> {
    let Some(current) = self.input_stream.next().copied() else {
        self.handle_error(TokenizerError::EofInCdataSection)?;
        self.emit(HtmlToken::EndOfFile)?;
        return Ok(());
    };
    if current == ']' {
        self.state = TokenizerState::CDATASectionBracket;
    } else {
        self.emit(HtmlToken::Character(current))?;
    }
    Ok(())
}
/// Handles the `CDATASectionBracket` tokenizer state.
///
/// `]` switches to `CDATASectionEnd`. Anything else (including end of
/// input) emits `]` as a character token and is reconsumed in
/// `CDATASection`.
///
/// # Errors
/// Propagates any error returned by the emitter or `reconsume_in_state`.
pub(super) fn cdata_section_bracket_state(&mut self) -> Result<(), HtmlParseError> {
    if matches!(self.input_stream.next(), Some(']')) {
        self.state = TokenizerState::CDATASectionEnd;
    } else {
        self.emit(HtmlToken::Character(']'))?;
        self.reconsume_in_state(TokenizerState::CDATASection)?;
    }
    Ok(())
}
/// Handles the `CDATASectionEnd` tokenizer state.
///
/// * `]` emits one `]` character token and stays in this state.
/// * `>` switches to `Data`.
/// * Anything else (including end of input) emits two `]` character tokens
///   and is reconsumed in `CDATASection`.
///
/// # Errors
/// Propagates any error returned by the emitter or `reconsume_in_state`.
pub(super) fn cdata_section_end_state(&mut self) -> Result<(), HtmlParseError> {
    let next = self.input_stream.next().copied();
    if next == Some('>') {
        self.state = TokenizerState::Data;
    } else if next == Some(']') {
        self.emit(HtmlToken::Character(']'))?;
    } else {
        self.emit(HtmlToken::Character(']'))?;
        self.emit(HtmlToken::Character(']'))?;
        self.reconsume_in_state(TokenizerState::CDATASection)?;
    }
    Ok(())
}
/// Handles the `HexadecimalCharacterReferenceStart` tokenizer state.
///
/// An ASCII hex digit is reconsumed in `HexadecimalCharacterReference`.
/// Anything else (including end of input) reports
/// `AbsenceOfDigitsInNumericCharacterReference`, flushes the buffered code
/// points and is reconsumed in the current return state.
///
/// # Errors
/// Propagates any error returned by the helpers it calls.
pub(super) fn hexadecimal_character_reference_start_state(
    &mut self,
) -> Result<(), HtmlParseError> {
    let starts_with_hex_digit = self
        .input_stream
        .next()
        .is_some_and(|c| c.is_ascii_hexdigit());
    if starts_with_hex_digit {
        self.reconsume_in_state(TokenizerState::HexadecimalCharacterReference)?;
    } else {
        self.handle_error(TokenizerError::AbsenceOfDigitsInNumericCharacterReference)?;
        self.flush_code_points_consumed_as_character_reference()?;
        let return_state = self.current_return_state()?;
        self.reconsume_in_state(return_state)?;
    }
    Ok(())
}
/// Handles the `HexadecimalCharacterReference` tokenizer state.
///
/// * An ASCII hex digit (either case) multiplies the character reference
///   code by sixteen and adds the digit's value, saturating instead of
///   overflowing.
/// * `;` switches to `NumericCharacterReferenceEnd`.
/// * Anything else (including end of input) reports
///   `MissingSemicolonAfterCharacterReference` and is reconsumed in
///   `NumericCharacterReferenceEnd`.
///
/// # Errors
/// Propagates any error returned by the helpers it calls, or an
/// `HtmlParseError` if the digit cannot be converted.
pub(super) fn hexadecimal_character_reference_state(
    &mut self,
) -> Result<(), HtmlParseError> {
    match self.input_stream.next().copied() {
        Some(current) if current.is_ascii_hexdigit() => {
            let scaled = self.character_reference_code.saturating_mul(16);
            self.character_reference_code = scaled;
            let digit = current
                .to_digit(16)
                .ok_or_else(|| HtmlParseError::new("hex character not a hex digit"))?;
            self.character_reference_code = scaled.saturating_add(digit);
        }
        Some(';') => self.state = TokenizerState::NumericCharacterReferenceEnd,
        _ => {
            self.handle_error(TokenizerError::MissingSemicolonAfterCharacterReference)?;
            self.reconsume_in_state(TokenizerState::NumericCharacterReferenceEnd)?;
        }
    }
    Ok(())
}
}
/// Returns `true` when `code_point` is a leading or a trailing surrogate.
fn is_surrogate(code_point: u32) -> bool {
    if is_leading_surrogate(code_point) {
        return true;
    }
    is_trailing_surrogate(code_point)
}
/// Returns `true` when `code_point` lies in `0xD800..=0xDBFF`.
fn is_leading_surrogate(code_point: u32) -> bool {
    matches!(code_point, 0xD800..=0xDBFF)
}
/// Returns `true` when `code_point` lies in `0xDC00..=0xDFFF`.
fn is_trailing_surrogate(code_point: u32) -> bool {
    matches!(code_point, 0xDC00..=0xDFFF)
}
/// Returns `true` when `code_point` is in `0xFDD0..=0xFDEF`, or is one of
/// the values ending in `FFFE` or `FFFF` from `0xFFFE` up to `0x10FFFF`.
fn is_noncharacter(code_point: u32) -> bool {
    if matches!(code_point, 0xFDD0..=0xFDEF) {
        return true;
    }
    // Values at or below 0x10FFFF whose low 16 bits are FFFE or FFFF.
    code_point <= 0x10FFFF && code_point & 0xFFFE == 0xFFFE
}
/// Returns `true` when `code_point` is a C0 control or lies in
/// `0x007F..=0x009F`.
fn is_control(code_point: u32) -> bool {
    matches!(code_point, 0x007F..=0x009F) || is_c0_control(code_point)
}
/// Returns `true` when `code_point` lies in `0x0000..=0x001F`.
fn is_c0_control(code_point: u32) -> bool {
    matches!(code_point, 0x0000..=0x001F)
}
/// Returns `true` when `code_point` is `0x09`, `0x0A`, `0x0C`, `0x0D` or
/// `0x20`.
fn is_ascii_whitespace(code_point: u32) -> bool {
    matches!(code_point, 0x0009 | 0x000A | 0x000C | 0x000D | 0x0020)
}
/// Lazily built lookup table mapping a numeric character reference code in
/// the `0x80..=0x9F` range to the code point that replaces it. Codes without
/// an entry are left unchanged by the caller.
static NUMERIC_CHARACTER_REF_END_TABLE: LazyLock<HashMap<u32, u32>> = LazyLock::new(|| {
    HashMap::from([
        (0x80, 0x20AC),
        (0x82, 0x201A),
        (0x83, 0x0192),
        (0x84, 0x201E),
        (0x85, 0x2026),
        (0x86, 0x2020),
        (0x87, 0x2021),
        (0x88, 0x02C6),
        (0x89, 0x2030),
        (0x8A, 0x0160),
        (0x8B, 0x2039),
        (0x8C, 0x0152),
        (0x8E, 0x017D),
        (0x91, 0x2018),
        (0x92, 0x2019),
        (0x93, 0x201C),
        (0x94, 0x201D),
        (0x95, 0x2022),
        (0x96, 0x2013),
        (0x97, 0x2014),
        (0x98, 0x02DC),
        (0x99, 0x2122),
        (0x9A, 0x0161),
        (0x9B, 0x203A),
        (0x9C, 0x0153),
        (0x9E, 0x017E),
        (0x9F, 0x0178),
    ])
});