use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
use crate::syntax::SyntaxKind;
use rowan::GreenNodeBuilder;
use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
/// Lowercase HTML tag names that start a block-level HTML block.
///
/// Candidates are lowercased before being looked up here (see
/// `is_html_block_tag_name` and `try_parse_html_block_start`), so every entry
/// must stay lowercase or it can never match.
// NOTE(review): this looks like the CommonMark "type 6" start-condition tag
// list — confirm which spec revision it is meant to track before editing.
const BLOCK_TAGS: &[&str] = &[
"address",
"article",
"aside",
"base",
"basefont",
"blockquote",
"body",
"caption",
"center",
"col",
"colgroup",
"dd",
"details",
"dialog",
"dir",
"div",
"dl",
"dt",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"frame",
"frameset",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"header",
"hr",
"html",
"iframe",
"legend",
"li",
"link",
"main",
"menu",
"menuitem",
"nav",
"noframes",
"ol",
"optgroup",
"option",
"p",
"param",
"section",
"source",
"summary",
"table",
"tbody",
"td",
"tfoot",
"th",
"thead",
"title",
"tr",
"track",
"ul",
];
/// Lowercase tag names treated as "verbatim" containers.
///
/// In `try_parse_html_block_start`, an opening tag with one of these names
/// starts a `BlockTag` with `is_verbatim: true` that is never closed by a
/// blank line, and a tag with one of these names never qualifies as `Type7`.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
/// Returns `true` when `name` is one of the entries of `BLOCK_TAGS`.
///
/// The comparison ignores ASCII case, so `"DIV"` and `"div"` are equivalent.
pub fn is_html_block_tag_name(name: &str) -> bool {
    BLOCK_TAGS
        .iter()
        .any(|candidate| candidate.eq_ignore_ascii_case(name))
}
/// Kind of HTML block recognised from a block's first line.
///
/// The kind decides how `parse_html_block_with_wrapper` finds the end of the
/// block (see `is_closing_marker` and `ends_at_blank_line`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
/// First line starts with `<!--`; the block ends on a line containing `-->`.
Comment,
/// First line starts with `<?`; the block ends on a line containing `?>`.
ProcessingInstruction,
/// First line starts with `<!` followed by an ASCII letter (only recognised
/// in CommonMark mode); the block ends on a line containing `>`.
Declaration,
/// First line starts with `<![CDATA[` (only recognised in CommonMark mode);
/// the block ends on a line containing `]]>`.
CData,
/// First line starts with a tag whose name is in `BLOCK_TAGS`, or with an
/// opening tag whose name is in `VERBATIM_TAGS`.
BlockTag {
/// Tag name, lowercased by `try_parse_html_block_start`.
tag_name: String,
/// `true` when the tag name is in `VERBATIM_TAGS`.
is_verbatim: bool,
/// `true` when the block ends at the next blank line instead of at a
/// closing tag.
closed_by_blank_line: bool,
/// `true` when nested open/close tags of the same name are counted to
/// find the matching close (only consulted when `closed_by_blank_line`
/// is `false`). Set to `!is_commonmark` by `try_parse_html_block_start`.
depth_aware: bool,
},
/// A complete open or close tag alone on its line whose name is not in
/// `VERBATIM_TAGS` (only recognised in CommonMark mode); the block ends at
/// the next blank line.
Type7,
}
/// Classifies `content` as the first line of an HTML block, if it is one.
///
/// Leading indentation is removed with `strip_leading_spaces` before matching.
/// The checks run in this order and the first match wins:
///
/// 1. `<!--` gives `Comment`.
/// 2. `<?` gives `ProcessingInstruction`.
/// 3. `<![CDATA[` gives `CData` (only when `is_commonmark`).
/// 4. `<!` followed by an ASCII letter gives `Declaration` (only when
///    `is_commonmark`).
/// 5. A tag whose name is in `BLOCK_TAGS`, or an opening tag whose name is in
///    `VERBATIM_TAGS`, gives `BlockTag`. Closing tags are only considered
///    when `is_commonmark`.
/// 6. A complete open or close tag followed only by whitespace, whose name is
///    not in `VERBATIM_TAGS`, gives `Type7` (only when `is_commonmark`).
///
/// Returns `None` when no rule applies.
pub(crate) fn try_parse_html_block_start(
    content: &str,
    is_commonmark: bool,
) -> Option<HtmlBlockType> {
    let line = strip_leading_spaces(content);
    if !line.starts_with('<') {
        return None;
    }

    // Markup that is recognised purely by its prefix.
    if line.starts_with("<!--") {
        return Some(HtmlBlockType::Comment);
    }
    if line.starts_with("<?") {
        return Some(HtmlBlockType::ProcessingInstruction);
    }
    if is_commonmark {
        if line.starts_with("<![CDATA[") {
            return Some(HtmlBlockType::CData);
        }
        let is_declaration = line
            .strip_prefix("<!")
            .and_then(|rest| rest.chars().next())
            .map_or(false, |c| c.is_ascii_alphabetic());
        if is_declaration {
            return Some(HtmlBlockType::Declaration);
        }
    }

    // Named block-level and verbatim tags.
    if let Some(raw_name) = extract_block_tag_name(line, is_commonmark) {
        let name = raw_name.to_lowercase();
        let known_block = BLOCK_TAGS.contains(&name.as_str());
        let known_verbatim = VERBATIM_TAGS.contains(&name.as_str());
        let is_opening = !line.starts_with("</");

        if known_block {
            return Some(HtmlBlockType::BlockTag {
                tag_name: name,
                is_verbatim: known_verbatim,
                closed_by_blank_line: is_commonmark && !known_verbatim,
                depth_aware: !is_commonmark,
            });
        }
        // Verbatim containers can only be started by their opening tag.
        if is_opening && known_verbatim {
            return Some(HtmlBlockType::BlockTag {
                tag_name: name,
                is_verbatim: true,
                closed_by_blank_line: false,
                depth_aware: !is_commonmark,
            });
        }
    }

    // Any other complete tag that has the line to itself.
    if is_commonmark && let Some(tag_end) = parse_open_tag(line).or_else(|| parse_close_tag(line))
    {
        let trailing_is_blank = line[tag_end..]
            .bytes()
            .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
        if trailing_is_blank {
            let after_lt = line.strip_prefix("</").unwrap_or_else(|| &line[1..]);
            let name: String = after_lt
                .chars()
                .take_while(|c| c.is_ascii_alphanumeric() || *c == '-')
                .map(|c| c.to_ascii_lowercase())
                .collect();
            if !VERBATIM_TAGS.contains(&name.as_str()) {
                return Some(HtmlBlockType::Type7);
            }
        }
    }

    None
}
/// Extracts the tag name from text that starts with `<name` or `</name`.
///
/// The name runs up to the first whitespace character, `>` or `/` (or the end
/// of `text`). It must start with an ASCII letter and consist only of ASCII
/// alphanumerics. The original casing is preserved.
///
/// Returns `None` when `text` does not start with `<`, when the name is empty
/// or malformed, or when the tag is a closing tag and `accept_closing` is
/// `false`.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let after_lt = text.strip_prefix('<')?;
    let body = match after_lt.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(after_slash) => after_slash,
        None => after_lt,
    };

    // `split` always yields a first piece, possibly empty.
    let name = body
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()
        .unwrap_or("");

    let starts_with_letter = matches!(name.chars().next(), Some(c) if c.is_ascii_alphabetic());
    let only_alphanumerics = name.chars().all(|c| c.is_ascii_alphanumeric());

    if starts_with_letter && only_alphanumerics {
        Some(name.to_string())
    } else {
        None
    }
}
/// Reports whether a block of this kind is terminated by a blank line.
///
/// That is the case for `Type7` and for any `BlockTag` whose
/// `closed_by_blank_line` flag is set; every other kind ends at an explicit
/// closing marker instead.
fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
    match block_type {
        HtmlBlockType::Type7 => true,
        HtmlBlockType::BlockTag {
            closed_by_blank_line,
            ..
        } => *closed_by_blank_line,
        HtmlBlockType::Comment
        | HtmlBlockType::ProcessingInstruction
        | HtmlBlockType::Declaration
        | HtmlBlockType::CData => false,
    }
}
/// Reports whether `line` contains the end marker for a block of `block_type`.
///
/// * `Comment` looks for `-->`, `ProcessingInstruction` for `?>`,
///   `Declaration` for `>`, and `CData` for `]]>`.
/// * A `BlockTag` that is not closed by a blank line looks for
///   `</tag_name>`, ignoring ASCII case in `line`. `tag_name` is expected to
///   be lowercase already, as produced by `try_parse_html_block_start`.
/// * Blank-line-terminated kinds have no closing marker, so this always
///   returns `false` for them.
fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
    match block_type {
        HtmlBlockType::Comment => line.contains("-->"),
        HtmlBlockType::ProcessingInstruction => line.contains("?>"),
        HtmlBlockType::Declaration => line.contains('>'),
        HtmlBlockType::CData => line.contains("]]>"),
        HtmlBlockType::BlockTag {
            tag_name,
            closed_by_blank_line: false,
            ..
        } => {
            let closing_tag = format!("</{}>", tag_name);
            // Tag names are ASCII case-insensitive. Full Unicode lowercasing
            // would fold look-alikes such as U+212A KELVIN SIGN to ASCII `k`
            // and report a closing tag that is not actually there.
            line.to_ascii_lowercase().contains(&closing_tag)
        }
        HtmlBlockType::BlockTag {
            closed_by_blank_line: true,
            ..
        }
        | HtmlBlockType::Type7 => false,
    }
}
/// Counts opening and closing occurrences of `tag_name` on a single line.
///
/// Returns `(opens, closes)`. Names are compared ignoring ASCII case, and the
/// name must be followed by whitespace, `>`, `/` or the end of the line, so
/// `<divider>` is not counted for `div`. An opening tag that ends in `/>` is
/// not counted as an open.
///
/// Every `<` is treated as the start of a tag and skipped through its first
/// unquoted `>`, so tag-like text inside a quoted attribute value is ignored.
/// As a consequence a stray `<` also swallows everything up to the next
/// unquoted `>`, and scanning stops altogether when no such `>` follows.
fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    let raw = line.as_bytes();
    // ASCII lowercasing keeps byte offsets aligned with `raw`.
    let folded = line.to_ascii_lowercase().into_bytes();
    let needle = tag_name.to_ascii_lowercase().into_bytes();

    let mut opens = 0usize;
    let mut closes = 0usize;
    let mut cursor = 0usize;

    while let Some(offset) = raw[cursor..].iter().position(|&b| b == b'<') {
        let lt = cursor + offset;
        let closing = raw.get(lt + 1) == Some(&b'/');
        let name_at = if closing { lt + 2 } else { lt + 1 };
        let name_end = name_at + needle.len();

        let name_matches = folded.get(name_at..name_end) == Some(needle.as_slice());
        let counts = name_matches
            && match raw.get(name_end).copied() {
                None => true,
                Some(b) => matches!(b, b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/'),
            };

        // Find the `>` that ends this tag, ignoring any `>` inside quotes.
        let scan_from = if name_matches { name_end } else { lt + 1 };
        let mut open_quote: Option<u8> = None;
        let mut tag_end: Option<usize> = None;
        for (idx, &b) in raw.iter().enumerate().skip(scan_from) {
            match open_quote {
                Some(q) => {
                    if b == q {
                        open_quote = None;
                    }
                }
                None => match b {
                    b'"' | b'\'' => open_quote = Some(b),
                    b'>' => {
                        tag_end = Some(idx);
                        break;
                    }
                    _ => {}
                },
            }
        }

        let self_closing = match tag_end {
            Some(gt) => gt > lt + 1 && raw[gt - 1] == b'/',
            None => false,
        };

        if counts {
            if closing {
                closes += 1;
            } else if !self_closing {
                opens += 1;
            }
        }

        match tag_end {
            Some(gt) => cursor = gt + 1,
            // Unterminated tag: nothing further on this line can be parsed.
            None => break,
        }
    }

    (opens, closes)
}
/// Parses the HTML block that starts at `lines[start_pos]` and emits it into
/// `builder` as a single `wrapper_kind` node.
///
/// The emitted node contains, in order: an `HTML_BLOCK_TAG` node for the
/// opening line, an `HTML_BLOCK_CONTENT` node for the lines in between (only
/// when there are any), and an `HTML_BLOCK_TAG` node for the closing line
/// (only when one was found). When `wrapper_kind` is `HTML_BLOCK_DIV` the
/// opening line is tokenised by `emit_div_open_tag_tokens`.
///
/// Scanning stops at the first of:
/// * a line whose blockquote depth is below `bq_depth` (not consumed);
/// * a blank line, for kinds where `ends_at_blank_line` is true (not consumed);
/// * the closing line (consumed): for depth-aware block tags the line on which
///   the open/close balance from `count_tag_balance` drops to zero or below,
///   otherwise the first line accepted by `is_closing_marker`;
/// * the end of `lines`.
///
/// Returns the index of the first line after the block.
///
/// # Panics
///
/// Panics if `start_pos` is out of bounds for `lines`.
// NOTE(review): the blockquote prefix of the opening line is stripped but not
// emitted here — presumably the caller has already emitted it; confirm.
// NOTE(review): for a depth-aware tag that is never closed (e.g. a void
// element such as `<hr>`), the balance never returns to zero, so the block
// runs until the quote depth drops or input ends — confirm callers expect this.
pub(crate) fn parse_html_block_with_wrapper(
    builder: &mut GreenNodeBuilder<'static>,
    lines: &[&str],
    start_pos: usize,
    block_type: HtmlBlockType,
    bq_depth: usize,
    wrapper_kind: SyntaxKind,
) -> usize {
    builder.start_node(wrapper_kind.into());

    let opening_line = lines[start_pos];
    let opening_inner = if bq_depth > 0 {
        strip_n_blockquote_markers(opening_line, bq_depth)
    } else {
        opening_line
    };
    let stops_at_blank = ends_at_blank_line(&block_type);

    // Opening line.
    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
    let (opening_text, opening_eol) = strip_newline(opening_inner);
    if !opening_text.is_empty() {
        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
            emit_div_open_tag_tokens(builder, opening_text);
        } else {
            builder.token(SyntaxKind::TEXT.into(), opening_text);
        }
    }
    if !opening_eol.is_empty() {
        builder.token(SyntaxKind::NEWLINE.into(), opening_eol);
    }
    builder.finish_node();

    // Tag whose nesting is tracked, when the block kind asks for that.
    let balanced_tag: Option<&str> = match &block_type {
        HtmlBlockType::BlockTag {
            tag_name,
            closed_by_blank_line: false,
            depth_aware: true,
            ..
        } => Some(tag_name.as_str()),
        _ => None,
    };

    let mut depth: i64 = match balanced_tag {
        Some(tag) => {
            let (opens, closes) = count_tag_balance(opening_inner, tag);
            opens as i64 - closes as i64
        }
        None => 1,
    };

    let closed_on_opening_line = !stops_at_blank
        && match balanced_tag {
            Some(_) => depth <= 0,
            None => is_closing_marker(opening_inner, &block_type),
        };
    if closed_on_opening_line {
        log::trace!(
            "HTML block at line {} opens and closes on same line",
            start_pos + 1
        );
        builder.finish_node();
        return start_pos + 1;
    }

    // Collect body lines until the block ends.
    let mut pos = start_pos + 1;
    let mut body: Vec<&str> = Vec::new();
    let mut closing_line: Option<&str> = None;
    while let Some(&line) = lines.get(pos) {
        let (line_depth, inner) = count_blockquote_markers(line);
        if line_depth < bq_depth {
            break;
        }
        if stops_at_blank && inner.trim().is_empty() {
            break;
        }

        let closes_here = match balanced_tag {
            Some(tag) => {
                let (opens, closes) = count_tag_balance(inner, tag);
                depth += opens as i64;
                depth -= closes as i64;
                depth <= 0
            }
            None => is_closing_marker(inner, &block_type),
        };

        if closes_here {
            log::trace!("Found HTML block closing at line {}", pos + 1);
            closing_line = Some(line);
            pos += 1;
            break;
        }
        body.push(line);
        pos += 1;
    }

    if closing_line.is_none() {
        log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
    }

    // Body first, then the closing line, mirroring source order.
    if !body.is_empty() {
        builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
        for body_line in body {
            emit_html_block_line(builder, body_line, bq_depth);
        }
        builder.finish_node();
    }
    if let Some(line) = closing_line {
        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
        emit_html_block_line(builder, line, bq_depth);
        builder.finish_node();
    }

    builder.finish_node();
    pos
}
/// Emits the tokens for the opening line of a `<div ...>` block.
///
/// `line` is the opening line without its newline. When it has the shape
/// `[spaces]<div[ws][attrs][ws][/]>[rest]` the following tokens are emitted,
/// each a slice of `line` so the source text is reproduced exactly:
///
/// * `WHITESPACE` for leading spaces,
/// * `TEXT` for `<div` (in the casing used in the source),
/// * `WHITESPACE` for blanks between the name and the attributes,
/// * an `HTML_ATTRS` node wrapping a `TEXT` token with the attribute text,
/// * `WHITESPACE` and `TEXT` for blanks and the `/` of a self-closing tag,
/// * `TEXT` for `>` and `TEXT` for anything after it.
///
/// Empty pieces are skipped. If the line (after its leading spaces) does not
/// start with `<div`, or the tag has no unquoted `>` on this line, the
/// remainder is emitted as a single `TEXT` token instead.
fn emit_div_open_tag_tokens(builder: &mut GreenNodeBuilder<'static>, line: &str) {
    const BLANKS: &[char] = &[' ', '\t'];

    let indent_end = line
        .as_bytes()
        .iter()
        .position(|&b| b != b' ')
        .unwrap_or(line.len());
    if indent_end > 0 {
        builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
    }
    let rest = &line[indent_end..];

    // Compare bytes rather than slicing the `str`: `rest[1..4]` would panic
    // when byte 4 falls inside a multi-byte character.
    let rest_bytes = rest.as_bytes();
    let is_div = rest_bytes.first() == Some(&b'<')
        && rest_bytes
            .get(1..4)
            .map_or(false, |name| name.eq_ignore_ascii_case(b"div"));
    if !is_div {
        builder.token(SyntaxKind::TEXT.into(), rest);
        return;
    }
    // The first four bytes are ASCII, so index 4 is a character boundary.
    let (tag_open, after_name) = rest.split_at(4);

    // Locate the `>` that ends the tag, ignoring any `>` inside quotes.
    let mut quote: Option<u8> = None;
    let mut tag_close: Option<usize> = None;
    for (i, &b) in after_name.as_bytes().iter().enumerate() {
        match (quote, b) {
            (None, b'"') | (None, b'\'') => quote = Some(b),
            (Some(q), b2) if b2 == q => quote = None,
            (None, b'>') => {
                tag_close = Some(i);
                break;
            }
            _ => {}
        }
    }
    let Some(tag_close) = tag_close else {
        builder.token(SyntaxKind::TEXT.into(), rest);
        return;
    };

    let attrs_inner = &after_name[..tag_close];
    let after_gt = &after_name[tag_close + 1..];

    let attrs_body = attrs_inner.trim_start_matches(BLANKS);
    let leading_ws = &attrs_inner[..attrs_inner.len() - attrs_body.len()];

    // Split a trailing `/` (self-closing tag) and the blanks before it off the
    // attribute text.
    let (attrs_text, gap, slash) = match attrs_body.strip_suffix('/') {
        Some(before_slash) => {
            let attrs = before_slash.trim_end_matches(BLANKS);
            (
                attrs,
                &before_slash[attrs.len()..],
                &attrs_body[before_slash.len()..],
            )
        }
        None => (attrs_body, "", ""),
    };

    // Emit the tag name exactly as written (`<DIV` stays `<DIV`) so the tree
    // text matches the source.
    builder.token(SyntaxKind::TEXT.into(), tag_open);
    if !leading_ws.is_empty() {
        builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
    }
    if !attrs_text.is_empty() {
        builder.start_node(SyntaxKind::HTML_ATTRS.into());
        builder.token(SyntaxKind::TEXT.into(), attrs_text);
        builder.finish_node();
    }
    if !gap.is_empty() {
        builder.token(SyntaxKind::WHITESPACE.into(), gap);
    }
    if !slash.is_empty() {
        builder.token(SyntaxKind::TEXT.into(), slash);
    }
    builder.token(SyntaxKind::TEXT.into(), ">");
    if !after_gt.is_empty() {
        builder.token(SyntaxKind::TEXT.into(), after_gt);
    }
}
/// Emits one line of an HTML block into `builder`.
///
/// When `bq_depth > 0`, the prefix removed by `strip_n_blockquote_markers` is
/// emitted first, one token per character: `BLOCK_QUOTE_MARKER` for each `>`
/// and `WHITESPACE` for anything else. The remainder of the line follows as a
/// `TEXT` token and a `NEWLINE` token; either is skipped when empty.
fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
    let mut content = line;
    if bq_depth > 0 {
        content = strip_n_blockquote_markers(line, bq_depth);
        // Whatever was stripped off the front is the blockquote prefix.
        let prefix = &line[..line.len() - content.len()];
        let mut utf8 = [0u8; 4];
        for ch in prefix.chars() {
            match ch {
                '>' => {
                    builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
                }
                other => {
                    builder.token(SyntaxKind::WHITESPACE.into(), other.encode_utf8(&mut utf8));
                }
            }
        }
    }

    let (text, line_ending) = strip_newline(content);
    if !text.is_empty() {
        builder.token(SyntaxKind::TEXT.into(), text);
    }
    if !line_ending.is_empty() {
        builder.token(SyntaxKind::NEWLINE.into(), line_ending);
    }
}
// Unit tests for HTML block start detection, closing-marker detection and
// line consumption. The block-parsing tests only check the returned line
// index; they do not inspect the emitted tree. Inputs are split with
// `str::lines`, so the lines handed to the parser carry no line terminators.
#[cfg(test)]
mod tests {
use super::*;
// `<!--` starts a comment block in Pandoc mode, with or without a leading
// space in front of it.
#[test]
fn test_try_parse_html_comment() {
assert_eq!(
try_parse_html_block_start("<!-- comment -->", false),
Some(HtmlBlockType::Comment)
);
assert_eq!(
try_parse_html_block_start(" <!-- comment -->", false),
Some(HtmlBlockType::Comment)
);
}
// In Pandoc mode a `<div>` (with or without attributes) is a depth-aware
// block tag that is not closed by a blank line.
#[test]
fn test_try_parse_div_tag() {
assert_eq!(
try_parse_html_block_start("<div>", false),
Some(HtmlBlockType::BlockTag {
tag_name: "div".to_string(),
is_verbatim: false,
closed_by_blank_line: false,
depth_aware: true,
})
);
assert_eq!(
try_parse_html_block_start("<div class=\"test\">", false),
Some(HtmlBlockType::BlockTag {
tag_name: "div".to_string(),
is_verbatim: false,
closed_by_blank_line: false,
depth_aware: true,
})
);
}
// `<script>` is a verbatim tag; in Pandoc mode it is still depth-aware.
#[test]
fn test_try_parse_script_tag() {
assert_eq!(
try_parse_html_block_start("<script>", false),
Some(HtmlBlockType::BlockTag {
tag_name: "script".to_string(),
is_verbatim: true,
closed_by_blank_line: false,
depth_aware: true,
})
);
}
// `<?` starts a processing instruction in Pandoc mode.
#[test]
fn test_try_parse_processing_instruction() {
assert_eq!(
try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
Some(HtmlBlockType::ProcessingInstruction)
);
}
// Declarations (`<!` + ASCII letter, any case) are recognised only in
// CommonMark mode.
#[test]
fn test_try_parse_declaration() {
assert_eq!(
try_parse_html_block_start("<!DOCTYPE html>", true),
Some(HtmlBlockType::Declaration)
);
assert_eq!(
try_parse_html_block_start("<!doctype html>", true),
Some(HtmlBlockType::Declaration)
);
assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
}
// CDATA sections are recognised only in CommonMark mode.
#[test]
fn test_try_parse_cdata() {
assert_eq!(
try_parse_html_block_start("<![CDATA[content]]>", true),
Some(HtmlBlockType::CData)
);
assert_eq!(
try_parse_html_block_start("<![CDATA[content]]>", false),
None
);
}
// With `accept_closing == false` only opening tags yield a name; empty
// names and names preceded by a space are rejected.
#[test]
fn test_extract_block_tag_name_open_only() {
assert_eq!(
extract_block_tag_name("<div>", false),
Some("div".to_string())
);
assert_eq!(
extract_block_tag_name("<div class=\"test\">", false),
Some("div".to_string())
);
assert_eq!(
extract_block_tag_name("<div/>", false),
Some("div".to_string())
);
assert_eq!(extract_block_tag_name("</div>", false), None);
assert_eq!(extract_block_tag_name("<>", false), None);
assert_eq!(extract_block_tag_name("< div>", false), None);
}
// With `accept_closing == true` closing tags yield their name too, even
// with whitespace before `>`.
#[test]
fn test_extract_block_tag_name_with_closing() {
assert_eq!(
extract_block_tag_name("</div>", true),
Some("div".to_string())
);
assert_eq!(
extract_block_tag_name("</div >", true),
Some("div".to_string())
);
}
// In CommonMark mode a closing block tag can start a block; it is closed by
// a blank line and is not depth-aware.
#[test]
fn test_commonmark_type6_closing_tag_start() {
assert_eq!(
try_parse_html_block_start("</div>", true),
Some(HtmlBlockType::BlockTag {
tag_name: "div".to_string(),
is_verbatim: false,
closed_by_blank_line: true,
depth_aware: false,
})
);
}
// A complete non-block open tag alone on its line is `Type7`, but only in
// CommonMark mode.
#[test]
fn test_commonmark_type7_open_tag() {
assert_eq!(
try_parse_html_block_start("<a href=\"foo\">", true),
Some(HtmlBlockType::Type7)
);
assert_eq!(try_parse_html_block_start("<a href=\"foo\">", false), None);
}
// A complete non-block close tag alone on its line is `Type7` as well.
#[test]
fn test_commonmark_type7_close_tag() {
assert_eq!(
try_parse_html_block_start("</ins>", true),
Some(HtmlBlockType::Type7)
);
}
// `Type7` requires that only whitespace follows the tag.
#[test]
fn test_commonmark_type7_rejects_with_trailing_text() {
assert_eq!(try_parse_html_block_start("<a> hi", true), None);
}
// A comment closes on any line containing `-->`.
#[test]
fn test_is_closing_marker_comment() {
let block_type = HtmlBlockType::Comment;
assert!(is_closing_marker("-->", &block_type));
assert!(is_closing_marker("end -->", &block_type));
assert!(!is_closing_marker("<!--", &block_type));
}
// A block tag that is not blank-line-terminated closes on `</tag>`, in any
// ASCII case and anywhere on the line; the opening tag does not close it.
#[test]
fn test_is_closing_marker_tag() {
let block_type = HtmlBlockType::BlockTag {
tag_name: "div".to_string(),
is_verbatim: false,
closed_by_blank_line: false,
depth_aware: false,
};
assert!(is_closing_marker("</div>", &block_type));
assert!(is_closing_marker("</DIV>", &block_type)); assert!(is_closing_marker("content</div>", &block_type));
assert!(!is_closing_marker("<div>", &block_type));
}
// A comment that opens and closes on one line consumes exactly that line.
#[test]
fn test_parse_html_comment_block() {
let input = "<!-- comment -->\n";
let lines: Vec<&str> = input.lines().collect();
let mut builder = GreenNodeBuilder::new();
let block_type = try_parse_html_block_start(lines[0], false).unwrap();
let new_pos = parse_html_block_with_wrapper(
&mut builder,
&lines,
0,
block_type,
0,
SyntaxKind::HTML_BLOCK,
);
assert_eq!(new_pos, 1);
}
// Opening line, one content line and the closing line are all consumed.
#[test]
fn test_parse_div_block() {
let input = "<div>\ncontent\n</div>\n";
let lines: Vec<&str> = input.lines().collect();
let mut builder = GreenNodeBuilder::new();
let block_type = try_parse_html_block_start(lines[0], false).unwrap();
let new_pos = parse_html_block_with_wrapper(
&mut builder,
&lines,
0,
block_type,
0,
SyntaxKind::HTML_BLOCK,
);
assert_eq!(new_pos, 3);
}
// Without a closing tag the block runs to the end of the input.
#[test]
fn test_parse_html_block_no_closing() {
let input = "<div>\ncontent\n";
let lines: Vec<&str> = input.lines().collect();
let mut builder = GreenNodeBuilder::new();
let block_type = try_parse_html_block_start(lines[0], false).unwrap();
let new_pos = parse_html_block_with_wrapper(
&mut builder,
&lines,
0,
block_type,
0,
SyntaxKind::HTML_BLOCK,
);
assert_eq!(new_pos, 2);
}
// In Pandoc mode nested `<div>`s are balanced: the inner `</div>` does not
// end the outer block, and blank lines do not end it either.
#[test]
fn test_parse_div_block_nested_pandoc() {
let input =
"<div id=\"outer\">\n\n<div id=\"inner\">\n\ndeep content\n\n</div>\n\n</div>\n";
let lines: Vec<&str> = input.lines().collect();
let mut builder = GreenNodeBuilder::new();
let block_type = try_parse_html_block_start(lines[0], false).unwrap();
let new_pos = parse_html_block_with_wrapper(
&mut builder,
&lines,
0,
block_type,
0,
SyntaxKind::HTML_BLOCK_DIV,
);
assert_eq!(new_pos, 9);
}
// A `<div>` opened and closed on the same line consumes only that line.
#[test]
fn test_parse_div_block_same_line_pandoc() {
let input = "<div>foo</div>\n";
let lines: Vec<&str> = input.lines().collect();
let mut builder = GreenNodeBuilder::new();
let block_type = try_parse_html_block_start(lines[0], false).unwrap();
let new_pos = parse_html_block_with_wrapper(
&mut builder,
&lines,
0,
block_type,
0,
SyntaxKind::HTML_BLOCK_DIV,
);
assert_eq!(new_pos, 1);
}
// In CommonMark mode a verbatim block is not depth-aware: a `<script>`
// appearing inside the content does not postpone the first `</script>`.
#[test]
fn test_commonmark_verbatim_first_close() {
let input = "<script>\nlet x = '<script>';\n</script>\n";
let lines: Vec<&str> = input.lines().collect();
let mut builder = GreenNodeBuilder::new();
let block_type = try_parse_html_block_start(lines[0], true).unwrap();
let new_pos = parse_html_block_with_wrapper(
&mut builder,
&lines,
0,
block_type,
0,
SyntaxKind::HTML_BLOCK,
);
assert_eq!(new_pos, 3);
}
// In CommonMark mode a `<div>` block ends at the first blank line, which is
// left unconsumed.
#[test]
fn test_commonmark_type6_blank_line_terminates() {
let input = "<div>\nfoo\n\nbar\n";
let lines: Vec<&str> = input.lines().collect();
let mut builder = GreenNodeBuilder::new();
let block_type = try_parse_html_block_start(lines[0], true).unwrap();
let new_pos = parse_html_block_with_wrapper(
&mut builder,
&lines,
0,
block_type,
0,
SyntaxKind::HTML_BLOCK,
);
assert_eq!(new_pos, 2);
}
}