use memchr::memchr;
use ox_content_ast::{Html, Node, Span};
use super::Parser;
use crate::error::ParseResult;
#[allow(unused_imports)]
use crate::profile_span;
/// How an HTML block was opened.
///
/// Produced by `Parser::parse_html_block_start` and consumed by
/// `Parser::parse_html_block`, where the variant selects the rule used to find
/// the end of the block.
#[derive(Clone, Copy)]
pub(super) enum HtmlBlockStart {
/// The line starts with `<!--`; the block runs through the first line that
/// contains `-->` (or to the end of input).
Comment,
/// The line opens one of the tags in [`Type1HtmlBlockTag`]; the block runs
/// through the first line that contains the matching `</name` sequence (or to
/// the end of input).
Type1(Type1HtmlBlockTag),
/// The line opens any other recognised block-level tag; the block covers the
/// opening line and every following line up to the next blank line.
Other,
}
/// Tags whose HTML block is terminated by a closing tag rather than by a blank
/// line.
///
/// NOTE(review): the name presumably refers to CommonMark's HTML block start
/// condition 1 — confirm against the spec version this parser targets.
#[derive(Clone, Copy)]
pub(super) enum Type1HtmlBlockTag {
/// `<pre>`
Pre,
/// `<script>`
Script,
/// `<style>`
Style,
/// `<textarea>`
Textarea,
}
impl Type1HtmlBlockTag {
    /// Returns the lowercase ASCII tag name as bytes, without any `<`, `/` or
    /// `>` delimiters, for use when scanning a line for the closing tag.
    fn closing_name(self) -> &'static [u8] {
        let name: &'static str = match self {
            Self::Pre => "pre",
            Self::Script => "script",
            Self::Style => "style",
            Self::Textarea => "textarea",
        };
        name.as_bytes()
    }
}
impl<'a> Parser<'a> {
/// Classifies the opening of a potential HTML block.
///
/// Returns [`HtmlBlockStart::Comment`] when `trimmed` begins with `<!--`.
/// Otherwise the tag name is extracted from `trimmed` and looked up in the
/// table of recognised block-level tags; `None` means the line does not open
/// an HTML block.
pub(super) fn parse_html_block_start(trimmed: &str) -> Option<HtmlBlockStart> {
    if trimmed.starts_with("<!--") {
        Some(HtmlBlockStart::Comment)
    } else {
        Self::parse_html_block_tag_name_from_trimmed(trimmed)
            .and_then(Self::html_block_start_for_tag)
    }
}
/// Consumes an HTML block and returns it as a [`Node::Html`].
///
/// `start` is the byte offset in the source where the block begins, and
/// `block_start` selects the termination rule:
///
/// * `Comment` — consume lines until one contains `-->`.
/// * `Type1(tag)` — consume lines until one contains `</tag` (ASCII
///   case-insensitive).
/// * `Other` — consume the opening line, then every line up to the next
///   blank line.
///
/// Every rule also stops at the end of input. The node's `value` is the raw
/// source slice from `start` to the parser position after consumption, and
/// `span` covers the same range.
pub(super) fn parse_html_block(
    &mut self,
    start: usize,
    block_start: HtmlBlockStart,
) -> ParseResult<Option<Node<'a>>> {
    profile_span!("parser::parse_html_block");

    match block_start {
        HtmlBlockStart::Comment => {
            let mut finished = false;
            while !finished {
                let line = self.consume_line();
                finished = line.contains("-->") || self.is_at_end();
            }
        }
        HtmlBlockStart::Type1(tag) => {
            let closing = tag.closing_name();
            let mut finished = false;
            while !finished {
                let line = self.consume_line();
                finished = ascii_contains_closing_tag(line, closing) || self.is_at_end();
            }
        }
        HtmlBlockStart::Other => {
            // The opening line always belongs to the block, even if the very
            // next line is blank.
            self.consume_line();
            self.advance_html_block_until_blank();
        }
    }

    let end = self.position;
    let html = Html {
        value: &self.source[start..end],
        span: Span::new(start as u32, end as u32),
    };
    Ok(Some(Node::Html(html)))
}
/// Extracts the tag name from text that starts with an opening (`<name`) or
/// closing (`</name`) tag.
///
/// The name is the longest run of ASCII alphanumerics and `-` after the `<`
/// (and optional `/`). It is only accepted when it is non-empty and is
/// followed by end of input, a space, a tab, a line ending (`\r` or `\n`),
/// `>` or `/`.
///
/// Returns `None` when `trimmed` does not start with `<`, when no name
/// characters follow, or when the name is followed by any other byte (for
/// example `<div=`).
fn parse_html_block_tag_name_from_trimmed(trimmed: &str) -> Option<&str> {
    let after_open = trimmed.strip_prefix('<')?;
    let candidate = after_open.strip_prefix('/').unwrap_or(after_open);

    let name_len = candidate
        .bytes()
        .take_while(|byte| byte.is_ascii_alphanumeric() || *byte == b'-')
        .count();
    if name_len == 0 {
        return None;
    }

    // Line endings are accepted so that a bare `<div` closing out a line is
    // recognised even when the caller's slice still carries its `\n` or the
    // `\r` of a CRLF terminator.
    match candidate.as_bytes().get(name_len) {
        // Every counted byte is ASCII, so `name_len` is a char boundary.
        None | Some(b' ' | b'\t' | b'\r' | b'\n' | b'>' | b'/') => Some(&candidate[..name_len]),
        Some(_) => None,
    }
}
fn html_block_start_for_tag(tag_name: &str) -> Option<HtmlBlockStart> {
let other = HtmlBlockStart::Other;
match tag_name.len() {
1 if tag_name.eq_ignore_ascii_case("p") => Some(other),
2 if tag_name.eq_ignore_ascii_case("ol")
|| tag_name.eq_ignore_ascii_case("td")
|| tag_name.eq_ignore_ascii_case("th")
|| tag_name.eq_ignore_ascii_case("tr")
|| tag_name.eq_ignore_ascii_case("ul") =>
{
Some(other)
}
3 if tag_name.eq_ignore_ascii_case("pre") => {
Some(HtmlBlockStart::Type1(Type1HtmlBlockTag::Pre))
}
3 if tag_name.eq_ignore_ascii_case("div") || tag_name.eq_ignore_ascii_case("nav") => {
Some(other)
}
4 if tag_name.eq_ignore_ascii_case("main") => Some(other),
5 if tag_name.eq_ignore_ascii_case("style") => {
Some(HtmlBlockStart::Type1(Type1HtmlBlockTag::Style))
}
5 if tag_name.eq_ignore_ascii_case("aside")
|| tag_name.eq_ignore_ascii_case("table")
|| tag_name.eq_ignore_ascii_case("tbody")
|| tag_name.eq_ignore_ascii_case("tfoot")
|| tag_name.eq_ignore_ascii_case("thead") =>
{
Some(other)
}
6 if tag_name.eq_ignore_ascii_case("script") => {
Some(HtmlBlockStart::Type1(Type1HtmlBlockTag::Script))
}
6 if tag_name.eq_ignore_ascii_case("dialog")
|| tag_name.eq_ignore_ascii_case("figure")
|| tag_name.eq_ignore_ascii_case("footer")
|| tag_name.eq_ignore_ascii_case("header") =>
{
Some(other)
}
7 if tag_name.eq_ignore_ascii_case("article")
|| tag_name.eq_ignore_ascii_case("details")
|| tag_name.eq_ignore_ascii_case("section")
|| tag_name.eq_ignore_ascii_case("summary") =>
{
Some(other)
}
8 if tag_name.eq_ignore_ascii_case("textarea") => {
Some(HtmlBlockStart::Type1(Type1HtmlBlockTag::Textarea))
}
10 if tag_name.eq_ignore_ascii_case("blockquote")
|| tag_name.eq_ignore_ascii_case("figcaption") =>
{
Some(other)
}
_ => None,
}
}
/// Moves `self.position` forward line by line until it sits at the start of
/// a blank line or at the end of the source.
///
/// A line is blank when it holds only spaces, tabs and `\r` before its `\n`
/// (or before the end of the source). The blank line itself is not consumed.
fn advance_html_block_until_blank(&mut self) {
    let src = self.source.as_bytes();
    let mut cursor = self.position;

    while cursor < src.len() {
        let line = &src[cursor..];
        // Offset of the first byte on this line that is not inline whitespace.
        let content_at = match line.iter().position(|&b| !matches!(b, b' ' | b'\t' | b'\r')) {
            Some(idx) if line[idx] != b'\n' => idx,
            // Whitespace-only up to a newline or to the end of the source:
            // this is the blank line, so stop at its start.
            _ => break,
        };
        // Skip past the end of this non-blank line.
        cursor = match memchr(b'\n', &line[content_at..]) {
            Some(off) => cursor + content_at + off + 1,
            None => src.len(),
        };
    }

    self.position = cursor;
}
}
/// Reports whether `haystack` contains `</` immediately followed by `tag`,
/// compared ASCII case-insensitively.
///
/// `tag` is the bare tag name (for example `b"script"`).
///
/// NOTE(review): the byte after the name is not inspected, so `</prefix>`
/// counts as a closing `pre` tag. Confirm whether a trailing `>` should be
/// required.
pub(super) fn ascii_contains_closing_tag(haystack: &str, tag: &[u8]) -> bool {
    let hay = haystack.as_bytes();
    // Length of `</` plus the tag name.
    let needle_len = tag.len() + 2;
    if hay.len() < needle_len {
        return false;
    }

    let mut from = 0;
    while let Some(rel) = memchr(b'<', &hay[from..]) {
        let lt = from + rel;
        // Bytes after the `<`: one for `/`, then `tag.len()` for the name.
        match hay.get(lt + 1..lt + needle_len) {
            Some([b'/', name @ ..]) if name.eq_ignore_ascii_case(tag) => return true,
            Some(_) => from = lt + 1,
            // Too few bytes remain here, and so for any later `<` as well.
            None => return false,
        }
    }
    false
}