use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
use crate::syntax::SyntaxKind;
use rowan::GreenNodeBuilder;
use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
/// Lowercase names of the HTML elements that open a block-level HTML block.
///
/// `try_parse_html_block_start` lowercases the tag name found at the start of
/// a line and looks it up here. Entries are kept in alphabetical order, one
/// initial letter per row.
const BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
];
/// Lowercase names of the elements whose HTML block is treated as verbatim.
///
/// A block opened by one of these tags is never terminated by a blank line;
/// it ends only at the matching closing tag (see `is_closing_marker`).
const VERBATIM_TAGS: &[&str] = &[
    "script",
    "style",
    "pre",
    "textarea",
];
/// The kind of HTML block recognised at the start of a line.
///
/// The kind decides how the block ends: either at a line containing a
/// specific closing marker (see `is_closing_marker`) or at the first blank
/// line (see `ends_at_blank_line`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
/// Line starts with `<!--`; ends at a line containing `-->`.
Comment,
/// Line starts with `<?`; ends at a line containing `?>`.
ProcessingInstruction,
/// Line starts with `<!` followed by an ASCII uppercase letter; ends at a
/// line containing `>`.
Declaration,
/// Line starts with `<![CDATA[`; ends at a line containing `]]>`.
CData,
/// Line starts with a tag listed in `BLOCK_TAGS` or `VERBATIM_TAGS`.
BlockTag {
/// Tag name, already lowercased by `try_parse_html_block_start`.
tag_name: String,
/// `true` when the tag is one of `VERBATIM_TAGS`.
is_verbatim: bool,
/// `true` when the block ends at the first blank line instead of at a
/// `</tag_name>` closing tag (set only in CommonMark mode for
/// non-verbatim tags).
closed_by_blank_line: bool,
},
/// CommonMark-only: the line is a single complete open or close tag (with a
/// name not in `VERBATIM_TAGS`) followed only by whitespace. Ends at the
/// first blank line.
Type7,
}
/// Classifies `content` as the first line of an HTML block, if it is one.
///
/// Leading spaces are removed with `strip_leading_spaces` before matching.
/// The checks run in this order and the first hit wins:
///
/// 1. `<!--` → [`HtmlBlockType::Comment`]
/// 2. `<?` → [`HtmlBlockType::ProcessingInstruction`]
/// 3. `<![CDATA[` → [`HtmlBlockType::CData`]
/// 4. `<!` + ASCII uppercase letter → [`HtmlBlockType::Declaration`]
/// 5. a tag whose lowercased name is in `BLOCK_TAGS` or `VERBATIM_TAGS`
///    → [`HtmlBlockType::BlockTag`]
/// 6. (only when `is_commonmark`) a complete open/close tag followed by
///    nothing but whitespace, whose name is not verbatim
///    → [`HtmlBlockType::Type7`]
///
/// `is_commonmark` additionally lets closing tags (`</div>`) start a tag
/// block and makes non-verbatim tag blocks end at a blank line.
///
/// Returns `None` when the line does not start an HTML block.
pub(crate) fn try_parse_html_block_start(
    content: &str,
    is_commonmark: bool,
) -> Option<HtmlBlockType> {
    let line = strip_leading_spaces(content);
    // Everything below requires a leading `<`.
    let after_lt = line.strip_prefix('<')?;

    if after_lt.starts_with("!--") {
        return Some(HtmlBlockType::Comment);
    }
    if after_lt.starts_with('?') {
        return Some(HtmlBlockType::ProcessingInstruction);
    }
    if after_lt.starts_with("![CDATA[") {
        return Some(HtmlBlockType::CData);
    }
    if let Some(after_bang) = after_lt.strip_prefix('!') {
        // A bare `<!` (nothing after the bang) is not a declaration.
        let starts_uppercase = after_bang
            .chars()
            .next()
            .is_some_and(|c| c.is_ascii_uppercase());
        if starts_uppercase {
            return Some(HtmlBlockType::Declaration);
        }
    }

    if let Some(raw_name) = extract_block_tag_name(line, is_commonmark) {
        let name = raw_name.to_lowercase();
        let block_level = BLOCK_TAGS.contains(&name.as_str());
        let verbatim = VERBATIM_TAGS.contains(&name.as_str());
        if block_level || verbatim {
            return Some(HtmlBlockType::BlockTag {
                // Only block-level, non-verbatim tags in CommonMark mode are
                // terminated by a blank line.
                closed_by_blank_line: block_level && is_commonmark && !verbatim,
                is_verbatim: verbatim,
                tag_name: name,
            });
        }
    }

    // Remaining case: a lone, complete tag on its own line (CommonMark only).
    if !is_commonmark {
        return None;
    }
    let tag_end = parse_open_tag(line).or_else(|| parse_close_tag(line))?;
    let nothing_follows = line[tag_end..]
        .bytes()
        .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
    if !nothing_follows {
        return None;
    }

    // Skip the optional `/` of a closing tag to reach the tag name.
    let name_start = after_lt.strip_prefix('/').unwrap_or(after_lt);
    let name_len = name_start
        .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
        .unwrap_or(name_start.len());
    let name = name_start[..name_len].to_ascii_lowercase();
    if VERBATIM_TAGS.contains(&name.as_str()) {
        None
    } else {
        Some(HtmlBlockType::Type7)
    }
}
/// Extracts the tag name from text that begins with an HTML tag.
///
/// `text` must start with `<`. A `/` directly after it (a closing tag) is
/// accepted only when `accept_closing` is `true`. The name runs up to the
/// first whitespace, `>` or `/` (or the end of `text`), must be non-empty,
/// start with an ASCII letter and consist solely of ASCII alphanumerics.
///
/// Returns the name with its original casing, or `None` if any of the
/// conditions above fails.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let after_lt = text.strip_prefix('<')?;
    let candidate = match after_lt.strip_prefix('/') {
        Some(after_slash) if accept_closing => after_slash,
        Some(_) => return None,
        None => after_lt,
    };

    // `split` always yields at least one piece: the text before the first
    // delimiter, or the whole string when there is none.
    let name = candidate
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()?;

    let mut letters = name.chars();
    // An empty name has no first character and is rejected here.
    let first = letters.next()?;
    let well_formed =
        first.is_ascii_alphabetic() && letters.all(|c| c.is_ascii_alphanumeric());
    well_formed.then(|| name.to_string())
}
/// Returns `true` when a block of this kind is terminated by a blank line.
///
/// That is the case for [`HtmlBlockType::Type7`] and for tag blocks whose
/// `closed_by_blank_line` flag is set; every other kind ends at an explicit
/// closing marker instead.
fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
    match block_type {
        HtmlBlockType::Type7 => true,
        HtmlBlockType::BlockTag {
            closed_by_blank_line,
            ..
        } => *closed_by_blank_line,
        HtmlBlockType::Comment
        | HtmlBlockType::ProcessingInstruction
        | HtmlBlockType::Declaration
        | HtmlBlockType::CData => false,
    }
}
/// Returns `true` when `line` contains the marker that ends a block of the
/// given kind.
///
/// * Comment → `-->`
/// * Processing instruction → `?>`
/// * Declaration → `>`
/// * CDATA → `]]>`
/// * Tag block not closed by a blank line → `</tag_name>`, matched
///   ASCII-case-insensitively. `tag_name` is expected to be lowercase already,
///   which is how `try_parse_html_block_start` stores it.
///
/// Blocks that end at a blank line (`Type7`, or a tag block with
/// `closed_by_blank_line: true`) have no closing marker, so this always
/// returns `false` for them.
fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
    match block_type {
        HtmlBlockType::Comment => line.contains("-->"),
        HtmlBlockType::ProcessingInstruction => line.contains("?>"),
        HtmlBlockType::Declaration => line.contains('>'),
        HtmlBlockType::CData => line.contains("]]>"),
        HtmlBlockType::BlockTag {
            tag_name,
            closed_by_blank_line: false,
            ..
        } => {
            let closing_tag = format!("</{}>", tag_name);
            // HTML tag names are ASCII case-insensitive only. Full Unicode
            // lowercasing would let look-alike characters (e.g. KELVIN SIGN
            // U+212A, which lowercases to ASCII `k`) spell a closing tag.
            line.to_ascii_lowercase().contains(&closing_tag)
        }
        HtmlBlockType::BlockTag {
            closed_by_blank_line: true,
            ..
        }
        | HtmlBlockType::Type7 => false,
    }
}
/// Parses one HTML block starting at `lines[start_pos]` into `builder`.
///
/// The emitted tree is an `HTML_BLOCK` node containing, in order:
///
/// * an `HTML_BLOCK_TAG` node for the opening line (with `bq_depth`
///   blockquote markers stripped and not emitted here),
/// * an optional `HTML_BLOCK_CONTENT` node holding the lines in between,
/// * an optional `HTML_BLOCK_TAG` node for the line carrying the closing
///   marker, when one is found.
///
/// The block stops at the first of: a line containing the closing marker for
/// `block_type`, a blank line (only for blank-line-terminated kinds), a line
/// with fewer than `bq_depth` blockquote markers, or the end of `lines`.
///
/// Returns the index of the first line that was not consumed.
pub(crate) fn parse_html_block(
    builder: &mut GreenNodeBuilder<'static>,
    lines: &[&str],
    start_pos: usize,
    block_type: HtmlBlockType,
    bq_depth: usize,
) -> usize {
    builder.start_node(SyntaxKind::HTML_BLOCK.into());

    let blank_terminated = ends_at_blank_line(&block_type);
    let opening = match bq_depth {
        0 => lines[start_pos],
        depth => strip_n_blockquote_markers(lines[start_pos], depth),
    };

    // The markers were already stripped above, so depth 0 is passed here and
    // only the text and newline tokens of the opening line are emitted.
    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
    emit_html_block_line(builder, opening, 0);
    builder.finish_node();

    if !blank_terminated && is_closing_marker(opening, &block_type) {
        log::trace!(
            "HTML block at line {} opens and closes on same line",
            start_pos + 1
        );
        builder.finish_node();
        return start_pos + 1;
    }

    let mut cursor = start_pos + 1;
    let mut body: Vec<&str> = Vec::new();
    let mut closing_line: Option<&str> = None;

    while let Some(&line) = lines.get(cursor) {
        // NOTE(review): `inner` is whatever `count_blockquote_markers` leaves
        // after the markers it counted (presumably all of them), whereas
        // emission strips exactly `bq_depth` markers. Confirm the two agree
        // for lines nested deeper than this block.
        let (depth, inner) = count_blockquote_markers(line);
        let left_blockquote = depth < bq_depth;
        let hit_blank = blank_terminated && inner.trim().is_empty();
        if left_blockquote || hit_blank {
            break;
        }

        if is_closing_marker(inner, &block_type) {
            log::trace!("Found HTML block closing at line {}", cursor + 1);
            closing_line = Some(line);
            cursor += 1;
            break;
        }

        body.push(line);
        cursor += 1;
    }

    if closing_line.is_none() {
        log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
    }

    // Content lines are emitted the same way whether or not a closing line
    // follows them.
    if !body.is_empty() {
        builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
        for content in body.iter().copied() {
            emit_html_block_line(builder, content, bq_depth);
        }
        builder.finish_node();
    }

    if let Some(line) = closing_line {
        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
        emit_html_block_line(builder, line, bq_depth);
        builder.finish_node();
    }

    builder.finish_node();
    cursor
}
/// Emits the tokens for one line of an HTML block.
///
/// When `bq_depth` is non-zero, the prefix removed by
/// `strip_n_blockquote_markers` is emitted first, character by character:
/// each `>` as a `BLOCK_QUOTE_MARKER` token and every other character as a
/// `WHITESPACE` token. The remainder of the line is then emitted as a `TEXT`
/// token followed by a `NEWLINE` token; either is skipped when empty.
fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
    let content = match bq_depth {
        0 => line,
        depth => {
            let content = strip_n_blockquote_markers(line, depth);
            // Whatever was stripped from the front of the line is the prefix.
            let prefix = &line[..line.len() - content.len()];
            for ch in prefix.chars() {
                match ch {
                    '>' => builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">"),
                    other => {
                        let mut utf8 = [0u8; 4];
                        builder.token(SyntaxKind::WHITESPACE.into(), other.encode_utf8(&mut utf8));
                    }
                }
            }
            content
        }
    };

    let (text, line_ending) = strip_newline(content);
    if !text.is_empty() {
        builder.token(SyntaxKind::TEXT.into(), text);
    }
    if !line_ending.is_empty() {
        builder.token(SyntaxKind::NEWLINE.into(), line_ending);
    }
}
// Unit tests for HTML block start detection, closing-marker detection and
// the line-consumption behaviour of `parse_html_block`.
#[cfg(test)]
mod tests {
use super::*;
// Comments are recognised with and without leading spaces.
#[test]
fn test_try_parse_html_comment() {
assert_eq!(
try_parse_html_block_start("<!-- comment -->", false),
Some(HtmlBlockType::Comment)
);
assert_eq!(
try_parse_html_block_start(" <!-- comment -->", false),
Some(HtmlBlockType::Comment)
);
}
// A block-level tag, with or without attributes, yields a non-verbatim
// tag block that is not blank-line terminated outside CommonMark mode.
#[test]
fn test_try_parse_div_tag() {
assert_eq!(
try_parse_html_block_start("<div>", false),
Some(HtmlBlockType::BlockTag {
tag_name: "div".to_string(),
is_verbatim: false,
closed_by_blank_line: false,
})
);
assert_eq!(
try_parse_html_block_start("<div class=\"test\">", false),
Some(HtmlBlockType::BlockTag {
tag_name: "div".to_string(),
is_verbatim: false,
closed_by_blank_line: false,
})
);
}
// Verbatim tags produce a verbatim tag block.
#[test]
fn test_try_parse_script_tag() {
assert_eq!(
try_parse_html_block_start("<script>", false),
Some(HtmlBlockType::BlockTag {
tag_name: "script".to_string(),
is_verbatim: true,
closed_by_blank_line: false,
})
);
}
// `<?` starts a processing instruction.
#[test]
fn test_try_parse_processing_instruction() {
assert_eq!(
try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
Some(HtmlBlockType::ProcessingInstruction)
);
}
// `<!` followed by an uppercase letter starts a declaration.
#[test]
fn test_try_parse_declaration() {
assert_eq!(
try_parse_html_block_start("<!DOCTYPE html>", false),
Some(HtmlBlockType::Declaration)
);
}
// `<![CDATA[` starts a CDATA section.
#[test]
fn test_try_parse_cdata() {
assert_eq!(
try_parse_html_block_start("<![CDATA[content]]>", false),
Some(HtmlBlockType::CData)
);
}
// With `accept_closing == false`, only opening tags yield a name.
#[test]
fn test_extract_block_tag_name_open_only() {
assert_eq!(
extract_block_tag_name("<div>", false),
Some("div".to_string())
);
assert_eq!(
extract_block_tag_name("<div class=\"test\">", false),
Some("div".to_string())
);
assert_eq!(
extract_block_tag_name("<div/>", false),
Some("div".to_string())
);
assert_eq!(extract_block_tag_name("</div>", false), None);
assert_eq!(extract_block_tag_name("<>", false), None);
assert_eq!(extract_block_tag_name("< div>", false), None);
}
// With `accept_closing == true`, closing tags yield a name as well.
#[test]
fn test_extract_block_tag_name_with_closing() {
assert_eq!(
extract_block_tag_name("</div>", true),
Some("div".to_string())
);
assert_eq!(
extract_block_tag_name("</div >", true),
Some("div".to_string())
);
}
// In CommonMark mode a closing block-level tag starts a blank-line
// terminated tag block.
#[test]
fn test_commonmark_type6_closing_tag_start() {
assert_eq!(
try_parse_html_block_start("</div>", true),
Some(HtmlBlockType::BlockTag {
tag_name: "div".to_string(),
is_verbatim: false,
closed_by_blank_line: true,
})
);
}
// A lone complete open tag that is not block-level is Type7 in CommonMark
// mode only.
#[test]
fn test_commonmark_type7_open_tag() {
assert_eq!(
try_parse_html_block_start("<a href=\"foo\">", true),
Some(HtmlBlockType::Type7)
);
assert_eq!(try_parse_html_block_start("<a href=\"foo\">", false), None);
}
// A lone complete close tag is Type7 in CommonMark mode.
#[test]
fn test_commonmark_type7_close_tag() {
assert_eq!(
try_parse_html_block_start("</ins>", true),
Some(HtmlBlockType::Type7)
);
}
// Type7 requires that nothing but whitespace follows the tag.
#[test]
fn test_commonmark_type7_rejects_with_trailing_text() {
assert_eq!(try_parse_html_block_start("<a> hi", true), None);
}
// Comment blocks close on any line containing `-->`.
#[test]
fn test_is_closing_marker_comment() {
let block_type = HtmlBlockType::Comment;
assert!(is_closing_marker("-->", &block_type));
assert!(is_closing_marker("end -->", &block_type));
assert!(!is_closing_marker("<!--", &block_type));
}
// Tag blocks close on a case-insensitive `</tag>` anywhere in the line.
#[test]
fn test_is_closing_marker_tag() {
let block_type = HtmlBlockType::BlockTag {
tag_name: "div".to_string(),
is_verbatim: false,
closed_by_blank_line: false,
};
assert!(is_closing_marker("</div>", &block_type));
assert!(is_closing_marker("</DIV>", &block_type)); assert!(is_closing_marker("content</div>", &block_type));
assert!(!is_closing_marker("<div>", &block_type));
}
// A comment that opens and closes on one line consumes exactly that line.
#[test]
fn test_parse_html_comment_block() {
let input = "<!-- comment -->\n";
let lines: Vec<&str> = input.lines().collect();
let mut builder = GreenNodeBuilder::new();
let block_type = try_parse_html_block_start(lines[0], false).unwrap();
let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);
assert_eq!(new_pos, 1);
}
// Opening line, content line and closing line are all consumed.
#[test]
fn test_parse_div_block() {
let input = "<div>\ncontent\n</div>\n";
let lines: Vec<&str> = input.lines().collect();
let mut builder = GreenNodeBuilder::new();
let block_type = try_parse_html_block_start(lines[0], false).unwrap();
let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);
assert_eq!(new_pos, 3);
}
// Without a closing tag the block runs to the end of the input.
#[test]
fn test_parse_html_block_no_closing() {
let input = "<div>\ncontent\n";
let lines: Vec<&str> = input.lines().collect();
let mut builder = GreenNodeBuilder::new();
let block_type = try_parse_html_block_start(lines[0], false).unwrap();
let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);
assert_eq!(new_pos, 2);
}
// In CommonMark mode the block stops at (and does not consume) the blank
// line.
#[test]
fn test_commonmark_type6_blank_line_terminates() {
let input = "<div>\nfoo\n\nbar\n";
let lines: Vec<&str> = input.lines().collect();
let mut builder = GreenNodeBuilder::new();
let block_type = try_parse_html_block_start(lines[0], true).unwrap();
let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);
assert_eq!(new_pos, 2);
}
}