use crate::{
naive_parser::naive_next_state,
offset::{Offset, Position},
reader::Reader,
Emitter, Error,
};
use super::Machine;
impl<R, O, E> Machine<R, O, E>
where
R: Reader + Position<O>,
O: Offset,
E: Emitter<O>,
{
/// Returns the current position as reported by the underlying reader
/// (`self.reader.position()`).
pub(crate) fn reader_position(&self) -> O {
self.reader.position()
}
/// Emits `c` to the emitter, attributing it to `source_char` in the input.
///
/// The reported span ends at the reader's current position and begins
/// `len_of_char_in_current_encoding(source_char)` before it, i.e. it covers the
/// width of `source_char` immediately preceding the current position.
#[inline]
pub(super) fn emit_char_for_source_char(&mut self, c: char, source_char: char) {
    let end = self.reader.position();
    let width = self.reader.len_of_char_in_current_encoding(source_char);
    self.emitter.emit_char(c, end - width..end);
}
/// Emits `c` where the emitted character and the source character are the same;
/// delegates to `emit_char_for_source_char(c, c)`.
#[inline]
pub(super) fn emit_char(&mut self, c: char) {
self.emit_char_for_source_char(c, c);
}
/// Emits every byte of `s` as its own character, with consecutive spans that
/// start at `self.some_offset`.
///
/// Each byte is converted to a `char` individually (byte value == code point),
/// and the span of each character is as wide as
/// `len_of_char_in_current_encoding` reports for it.
// NOTE(review): because bytes are converted one by one, multi-byte UTF-8
// sequences would not round-trip; callers presumably only pass ASCII — confirm.
#[inline]
pub(super) fn emit_chars(&mut self, s: &[u8]) {
    let mut cursor = self.some_offset;
    for &byte in s {
        let ch = char::from(byte);
        let next = cursor + self.reader.len_of_char_in_current_encoding(ch);
        self.emitter.emit_char(ch, cursor..next);
        cursor = next;
    }
}
#[inline]
pub(crate) fn emit_error(&mut self, error: Error) {
let span = match error {
Error::EofBeforeTagName
| Error::EofInCdata
| Error::EofInComment
| Error::EofInDoctype
| Error::EofInScriptHtmlCommentLikeText
| Error::EofInTag
| Error::MissingSemicolonAfterCharacterReference => {
self.reader.position()..self.reader.position()
}
Error::AbsenceOfDigitsInNumericCharacterReference
| Error::NullCharacterReference
| Error::CharacterReferenceOutsideUnicodeRange
| Error::SurrogateCharacterReference
| Error::NoncharacterCharacterReference
| Error::ControlCharacterReference
| Error::UnknownNamedCharacterReference => self.some_offset..self.reader.position(),
_ => self.position_before_match..self.reader.position(),
};
self.emitter.report_error(error, span);
}
/// Returns `true` when the name of the tag currently being built
/// (`current_tag_name`) equals `last_start_tag_name`.
///
/// This is a pure query, so it only needs shared access to `self`; existing
/// callers holding `&mut self` can still call it unchanged.
#[inline]
pub(super) fn current_end_tag_is_appropriate(&self) -> bool {
    self.current_tag_name == self.last_start_tag_name
}
/// Begins a new start tag.
///
/// Hands `some_offset` and `position_before_match` to the emitter's
/// `init_start_tag`, marks the tag under construction as a start tag and resets
/// the locally tracked tag name.
#[inline]
pub(super) fn init_start_tag(&mut self) {
    let (some_offset, before_match) = (self.some_offset, self.position_before_match);
    self.emitter.init_start_tag(some_offset, before_match);
    self.is_start_tag = true;
    self.current_tag_name.clear();
}
/// Begins a new end tag.
///
/// Hands `some_offset` and `position_before_match` to the emitter's
/// `init_end_tag`, marks the tag under construction as an end tag and resets the
/// locally tracked tag name.
#[inline]
pub(super) fn init_end_tag(&mut self) {
    let (some_offset, before_match) = (self.some_offset, self.position_before_match);
    self.emitter.init_end_tag(some_offset, before_match);
    self.is_start_tag = false;
    self.current_tag_name.clear();
}
/// Begins a new doctype by passing `some_offset` to the emitter's `init_doctype`.
#[inline]
pub(super) fn init_doctype(&mut self) {
    let offset = self.some_offset;
    self.emitter.init_doctype(offset);
}
/// Appends `s` to the name of the tag under construction.
///
/// The name is kept in two places: the machine's own `current_tag_name` and the
/// emitter's tag; both receive `s`.
#[inline]
pub(super) fn push_tag_name(&mut self, s: &str) {
    self.current_tag_name.push_str(s);
    self.emitter.push_tag_name(s);
}
/// Emits the tag under construction, ending at the reader's current position.
///
/// For start tags only:
/// * if `naively_switch_state` is set, the next state is derived from the tag
///   name via `naive_next_state`;
/// * `current_tag_name` and `last_start_tag_name` are swapped, so the name of
///   this tag becomes the last start tag name.
#[inline]
pub(super) fn emit_current_tag(&mut self) {
    let end = self.reader.position();
    self.emitter.emit_current_tag(end);

    // End tags neither switch state nor update the last start tag name.
    if !self.is_start_tag {
        return;
    }

    if self.naively_switch_state {
        self.state = naive_next_state(&self.current_tag_name).into();
    }
    std::mem::swap(&mut self.last_start_tag_name, &mut self.current_tag_name);
}
/// Pushes `c` onto `to_reconsume`, so that a later `read_char` call pops it
/// before reading anything new from the reader.
// NOTE(review): `None` presumably stands for end of input — confirm.
#[inline]
pub(super) fn unread_char(&mut self, c: Option<char>) {
self.to_reconsume.push(c);
}
#[inline]
fn validate_char(&mut self, c: char) {
match c as u32 {
surrogate_pat!() => {
self.emit_error(Error::SurrogateInInputStream);
}
noncharacter_pat!() => {
self.emit_error(Error::NoncharacterInInputStream);
}
x @ control_pat!()
if !matches!(x, 0x0000 | 0x0009 | 0x000a | 0x000c | 0x000d | 0x0020) =>
{
self.emit_error(Error::ControlCharacterInInputStream);
}
_ => (),
}
}
/// Reads the next character, preferring characters queued in `to_reconsume`
/// over the reader.
///
/// A `'\r'` is returned as `'\n'`; the character following it is read from the
/// reader and dropped if it is `'\n'` (so `"\r\n"` yields a single `'\n'`),
/// otherwise it is pushed back via `unread_char`.
///
/// Characters that came from the reader are passed through `validate_char`;
/// characters popped from `to_reconsume` are not.
///
/// Returns `Ok(None)` when the source yields `None`, and propagates reader
/// errors.
// NOTE(review): the character pushed back after a lone `'\r'` is later popped
// as "reconsumed" and therefore never reaches `validate_char` — confirm that
// this is intended.
pub(super) fn read_char(&mut self) -> Result<Option<char>, R::Error> {
    let pending = self.to_reconsume.pop();
    let reconsumed = pending.is_some();
    let next = match pending {
        Some(queued) => queued,
        None => self.reader.read_char()?,
    };

    let c = match next {
        None => return Ok(None),
        Some('\r') => {
            let following = self.reader.read_char()?;
            if following != Some('\n') {
                self.unread_char(following);
            }
            '\n'
        }
        Some(other) => other,
    };

    if !reconsumed {
        self.validate_char(c);
    }
    Ok(Some(c))
}
/// Tries to consume exactly the string `s` from the input.
///
/// Characters queued in `to_reconsume` are matched first; whatever part of `s`
/// is left afterwards is matched by the reader's `try_read_string`. With
/// `case_sensitive == false`, characters are compared ASCII-case-insensitively.
///
/// Returns `Ok(true)` if all of `s` was consumed. On `Ok(false)` the reconsume
/// queue is restored to the state it had on entry, including when the queue
/// matched a prefix of `s` but the reader then failed to match the rest.
///
/// If `s` is fully matched by queued characters, the reader is not consulted
/// and any further queued characters stay queued.
///
/// # Errors
///
/// Propagates errors from the reader.
#[inline]
pub(super) fn try_read_string(
    &mut self,
    s: &str,
    case_sensitive: bool,
) -> Result<bool, R::Error> {
    debug_assert!(!s.is_empty());
    let to_reconsume_bak = self.to_reconsume;
    let mut chars = s.chars();

    // Match against queued characters for as long as both sides have input.
    while !chars.as_str().is_empty() {
        let pending = match self.to_reconsume.pop() {
            Some(pending) => pending,
            None => break,
        };
        let is_match = match (pending, chars.next()) {
            (Some(actual), Some(expected)) => {
                actual == expected
                    || (!case_sensitive
                        && actual.to_ascii_lowercase() == expected.to_ascii_lowercase())
            }
            _ => false,
        };
        if !is_match {
            self.to_reconsume = to_reconsume_bak;
            return Ok(false);
        }
    }

    let rest = chars.as_str();
    if rest.is_empty() {
        // Everything was matched by queued characters; nothing left for the reader.
        return Ok(true);
    }

    let matched = self.reader.try_read_string(rest, case_sensitive)?;
    if !matched {
        // Put back the queued characters that matched a prefix, so that a failed
        // attempt consumes nothing.
        self.to_reconsume = to_reconsume_bak;
    }
    Ok(matched)
}
/// Returns `true` when `return_state` is one of the three attribute-value
/// states (double-quoted, single-quoted or unquoted).
pub(super) fn is_consumed_as_part_of_an_attribute(&self) -> bool {
    match self.return_state {
        Some(state) => matches!(
            state,
            State::AttributeValueDoubleQuoted
                | State::AttributeValueSingleQuoted
                | State::AttributeValueUnquoted
        ),
        None => false,
    }
}
/// Flushes the temporary buffer to wherever the character reference belongs.
///
/// Inside an attribute value (see `is_consumed_as_part_of_an_attribute`) the
/// buffer is appended to the attribute value and cleared; otherwise its contents
/// are emitted as characters via `flush_buffer_characters`.
pub(super) fn flush_code_points_consumed_as_character_reference(&mut self) {
    if !self.is_consumed_as_part_of_an_attribute() {
        self.flush_buffer_characters();
        return;
    }
    self.emitter.push_attribute_value(&self.temporary_buffer);
    self.temporary_buffer.clear();
}
/// Emits the contents of the temporary buffer as characters and leaves the
/// buffer empty.
///
/// Spans are consecutive, starting at `self.some_offset`; each character's span
/// is as wide as `len_of_char_in_current_encoding` reports for it.
///
/// The buffer is iterated by `char` rather than by byte, so non-ASCII content is
/// emitted as the characters it actually contains. For ASCII-only content this
/// is identical to emitting byte by byte.
pub(super) fn flush_buffer_characters(&mut self) {
    // Move the buffer out so it can be read while `self` is borrowed mutably.
    let mut buffer = std::mem::take(&mut self.temporary_buffer);
    let mut start = self.some_offset;
    for c in buffer.chars() {
        let end = start + self.reader.len_of_char_in_current_encoding(c);
        self.emitter.emit_char(c, start..end);
        start = end;
    }
    // Hand the (now empty) buffer back to keep its allocation for reuse.
    buffer.clear();
    self.temporary_buffer = buffer;
}
}
// Integer pattern matching the surrogate code point range U+D800..=U+DFFF.
macro_rules! surrogate_pat {
() => {
0xd800..=0xdfff
};
}
pub(crate) use surrogate_pat;
// Integer pattern matching control code points: U+0000..=U+001F and
// U+007F..=U+009F.
macro_rules! control_pat {
    () => {
        0x0000..=0x001f | 0x007f..=0x009f
    };
}
pub(crate) use control_pat;
// `char` pattern matching the ASCII digits '0' through '9'.
macro_rules! ascii_digit_pat {
() => {
'0'..='9'
};
}
pub(crate) use ascii_digit_pat;
// `char` pattern matching tab (U+0009), line feed (U+000A), form feed (U+000C)
// and space (U+0020). Carriage return is not part of this pattern.
macro_rules! whitespace_pat {
    () => {
        '\u{09}' | '\u{0A}' | '\u{0C}' | '\u{20}'
    };
}
pub(crate) use whitespace_pat;
// Integer pattern matching noncharacter code points: U+FDD0..=U+FDEF plus the
// last two code points (..FFFE and ..FFFF) of each plane from 0x0 to 0x10.
macro_rules! noncharacter_pat {
() => {
0xfdd0
..=0xfdef
| 0xfffe
| 0xffff
| 0x1fffe
| 0x1ffff
| 0x2fffe
| 0x2ffff
| 0x3fffe
| 0x3ffff
| 0x4fffe
| 0x4ffff
| 0x5fffe
| 0x5ffff
| 0x6fffe
| 0x6ffff
| 0x7fffe
| 0x7ffff
| 0x8fffe
| 0x8ffff
| 0x9fffe
| 0x9ffff
| 0xafffe
| 0xaffff
| 0xbfffe
| 0xbffff
| 0xcfffe
| 0xcffff
| 0xdfffe
| 0xdffff
| 0xefffe
| 0xeffff
| 0xffffe
| 0xfffff
| 0x10fffe
| 0x10ffff
};
}
pub(crate) use noncharacter_pat;
/// The states of the tokenizer state machine.
///
/// `Machine` keeps a value of this type as its optional `return_state`, which
/// `is_consumed_as_part_of_an_attribute` compares against the three
/// `AttributeValue*` variants.
// NOTE(review): the variant names appear to mirror the tokenization states of
// the WHATWG HTML specification one-to-one — confirm before relying on it.
#[allow(missing_docs)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum State {
Data,
Rcdata,
Rawtext,
ScriptData,
Plaintext,
TagOpen,
EndTagOpen,
TagName,
RcdataLessThanSign,
RcdataEndTagOpen,
RcdataEndTagName,
RawTextLessThanSign,
RawTextEndTagOpen,
RawTextEndTagName,
ScriptDataLessThanSign,
ScriptDataEndTagOpen,
ScriptDataEndTagName,
ScriptDataEscapeStart,
ScriptDataEscapeStartDash,
ScriptDataEscaped,
ScriptDataEscapedDash,
ScriptDataEscapedDashDash,
ScriptDataEscapedLessThanSign,
ScriptDataEscapedEndTagOpen,
ScriptDataEscapedEndTagName,
ScriptDataDoubleEscapeStart,
ScriptDataDoubleEscaped,
ScriptDataDoubleEscapedDash,
ScriptDataDoubleEscapedDashDash,
ScriptDataDoubleEscapedLessThanSign,
ScriptDataDoubleEscapeEnd,
BeforeAttributeName,
AttributeName,
AfterAttributeName,
BeforeAttributeValue,
AttributeValueDoubleQuoted,
AttributeValueSingleQuoted,
AttributeValueUnquoted,
AfterAttributeValueQuoted,
SelfClosingStartTag,
BogusComment,
MarkupDeclarationOpen,
CommentStart,
CommentStartDash,
Comment,
CommentLessThanSign,
CommentLessThanSignBang,
CommentLessThanSignBangDash,
CommentLessThanSignBangDashDash,
CommentEndDash,
CommentEnd,
CommentEndBang,
Doctype,
BeforeDoctypeName,
DoctypeName,
AfterDoctypeName,
AfterDoctypePublicKeyword,
BeforeDoctypePublicIdentifier,
DoctypePublicIdentifierDoubleQuoted,
DoctypePublicIdentifierSingleQuoted,
AfterDoctypePublicIdentifier,
BetweenDoctypePublicAndSystemIdentifiers,
AfterDoctypeSystemKeyword,
BeforeDoctypeSystemIdentifier,
DoctypeSystemIdentifierDoubleQuoted,
DoctypeSystemIdentifierSingleQuoted,
AfterDoctypeSystemIdentifier,
BogusDoctype,
CdataSection,
CdataSectionBracket,
CdataSectionEnd,
CharacterReference,
NamedCharacterReference,
AmbiguousAmpersand,
NumericCharacterReference,
HexadecimalCharacterReferenceStart,
DecimalCharacterReferenceStart,
HexadecimalCharacterReference,
DecimalCharacterReference,
NumericCharacterReferenceEnd,
}
// Expands to a `&str` holding `$c` encoded as UTF-8: `$c.encode_utf8` writes
// into a temporary 4-byte array and the resulting `&mut str` is reborrowed as
// `&str`. The result borrows that temporary array, so it can only be used
// within the expression/statement the macro is invoked in.
macro_rules! ctostr {
($c:expr) => {
&*$c.encode_utf8(&mut [0; 4])
};
}
pub(crate) use ctostr;