/// Lower-case HTML element names whose opening and closing tags are moved onto
/// their own line by `break_html_tags` (this is the list `expand` passes in).
///
/// Entries are kept in alphabetical order, one row per initial letter.
pub const BLOCK_HTML_TAGS: &[&str] = &[
    "address", "article", "aside", "audio",
    "blockquote", "body", "button",
    "canvas", "caption", "col", "colgroup",
    "datalist", "dd", "details", "dialog", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
    "iframe",
    "label", "legend", "li", "link",
    "main", "map", "menu", "meta",
    "nav", "noscript",
    "ol", "optgroup", "option", "output",
    "p", "picture", "pre", "progress",
    "script", "section", "select", "source", "span", "style", "summary",
    "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
    "video",
];
/// Leading keywords of `{% ... %}` tags that `break_template_tags` places on
/// their own line (this is the list `expand` passes in).
///
/// `"else if"` is listed explicitly because `extract_keyword` reports that
/// two-word form as a single keyword.
const TEMPLATE_BREAK_KEYWORDS: &[&str] = &[
    // Conditionals.
    "if",
    "else",
    "else if",
    "endif",
    // Loops.
    "for",
    "endfor",
    // Paired block constructs.
    "macro",
    "endmacro",
    "block",
    "endblock",
    "filter",
    "endfilter",
    "with",
    "endwith",
    "raw",
    "endraw",
    // Stand-alone tags.
    "include",
    "extends",
    "import",
    // Pattern matching.
    "match",
    "endmatch",
    "when",
];
/// Computes the byte ranges of `html` whose contents must not be reformatted.
///
/// Returns `(start, end)` pairs (end exclusive), in ascending order and never
/// extending past `html.len()`. A span is produced for:
///
/// * `{# ... #}` template comments and `<!-- ... -->` HTML comments,
///   delimiters included;
/// * a `raw` template tag, from its opening `{%` up to (not including) the
///   matching `endraw` tag;
/// * `<pre>`, `<script>` and `<style>` elements, from `<` through the closing
///   tag.
///
/// A construct that is never closed yields a span that runs to the end of the
/// input. Any other complete `{% ... %}` tag is skipped without producing a
/// span, so its contents are not scanned for the constructs above.
fn compute_raw_spans(html: &str) -> Vec<(usize, usize)> {
    /// `(opening prefix, closing tag)` of the elements treated as raw.
    const RAW_ELEMENTS: [(&str, &str); 3] = [
        ("<pre", "</pre>"),
        ("<script", "</script>"),
        ("<style", "</style>"),
    ];

    /// Whitespace, or one of the `-` / `+` / `~` markers that may sit next to
    /// a `{%` / `%}` delimiter.
    fn is_space_or_marker(c: char) -> bool {
        c.is_whitespace() || matches!(c, '-' | '+' | '~')
    }

    /// True when `after_open` (the text right after a `{%`) is the remainder
    /// of an `endraw` tag, with any spacing or delimiter markers.
    fn is_endraw_tag(after_open: &str) -> bool {
        match after_open
            .trim_start_matches(is_space_or_marker)
            .strip_prefix("endraw")
        {
            Some(rest) => rest.trim_start_matches(is_space_or_marker).starts_with("%}"),
            None => false,
        }
    }

    /// Byte offset of the first `endraw` tag starting at or after `from`.
    fn find_endraw(html: &str, from: usize) -> Option<usize> {
        let mut search = from;
        while let Some(offset) = html[search..].find("{%") {
            let open = search + offset;
            if is_endraw_tag(&html[open + 2..]) {
                return Some(open);
            }
            search = open + 2;
        }
        None
    }

    let bytes = html.as_bytes();
    let len = bytes.len();
    let mut spans: Vec<(usize, usize)> = Vec::new();
    let mut i = 0usize;

    while i < len {
        let rest = &bytes[i..];

        if rest.starts_with(b"{#") {
            let end = html[i + 2..].find("#}").map_or(len, |o| i + 2 + o + 2);
            spans.push((i, end));
            i = end;
            continue;
        }

        if rest.starts_with(b"<!--") {
            let end = html[i + 4..].find("-->").map_or(len, |o| i + 4 + o + 3);
            spans.push((i, end));
            i = end;
            continue;
        }

        if rest.starts_with(b"{%") {
            if let Some(offset) = html[i + 2..].find("%}") {
                let tag_end = i + 2 + offset;
                let inner = html[i + 2..tag_end].trim_matches(is_space_or_marker);
                if inner == "raw" || inner.starts_with("raw ") || inner.starts_with("raw\t") {
                    // The span stops where the `endraw` tag starts, so that tag
                    // itself stays outside the span.
                    let raw_end = find_endraw(html, tag_end + 2).unwrap_or(len);
                    spans.push((i, raw_end));
                    i = raw_end;
                    continue;
                }
                i = tag_end + 2;
                continue;
            }
            // No closing `%}` anywhere: fall through and treat `{` as plain text.
        }

        if bytes[i] == b'<' {
            let tail = &html[i..];
            let raw_element = RAW_ELEMENTS.iter().find(|(open, _)| {
                match tail.strip_prefix(*open) {
                    // Require the tag name to end here, so that e.g. `<prefix-box>`
                    // is not mistaken for `<pre>`. Name characters follow the same
                    // rule `match_html_block_tag` uses: alphanumeric or `-`.
                    Some(after) => match after.chars().next() {
                        Some(c) => !(c.is_alphanumeric() || c == '-'),
                        None => true,
                    },
                    None => false,
                }
            });
            if let Some((_, close)) = raw_element {
                let end = tail.find(*close).map_or(len, |o| i + o + close.len());
                spans.push((i, end));
                i = end;
                continue;
            }
        }

        i += 1;
    }

    spans
}
/// Reports whether `pos` lies strictly inside one of `spans`.
///
/// `spans` must be sorted by start offset (the lookup is a binary search).
/// Both bounds are exclusive: a position equal to a span's start, or to its
/// end, is reported as outside the span.
fn is_in_raw_span(spans: &[(usize, usize)], pos: usize) -> bool {
    // Number of spans that begin before `pos`; only the last of them can cover it.
    let started_before = spans.partition_point(|span| span.0 < pos);
    match spans[..started_before].last() {
        Some(&(_, end)) => pos < end,
        None => false,
    }
}
/// Returns the absolute byte offset of the first occurrence of `needle` in
/// `s` at or after byte offset `from`, or `None` if there is none.
///
/// Panics if `from` is past the end of `s` or not on a character boundary.
fn find_close(s: &str, from: usize, needle: &str) -> Option<usize> {
    let relative = s[from..].find(needle)?;
    Some(from + relative)
}
/// Expands `html` so that block-level HTML tags and block-style template tags
/// each sit on their own line.
///
/// Runs in three stages: break around the tags in [`BLOCK_HTML_TAGS`], break
/// around template tags whose keyword is in `TEMPLATE_BREAK_KEYWORDS`, then
/// collapse long runs of newlines. Raw spans are recomputed between the first
/// two stages because the first stage changes byte offsets.
pub fn expand(html: &str) -> String {
    let html_pass = {
        let raw_spans = compute_raw_spans(html);
        break_html_tags(html, BLOCK_HTML_TAGS, &raw_spans)
    };
    let template_pass = {
        let raw_spans = compute_raw_spans(&html_pass);
        break_template_tags(&html_pass, TEMPLATE_BREAK_KEYWORDS, &raw_spans)
    };
    collapse_blank_lines(&template_pass)
}
/// Copies `html`, placing every tag whose name is in `tags` on its own line.
///
/// * `tags` – lower-case element names to break around (opening and closing
///   tags alike, as recognised by `match_html_block_tag`).
/// * `spans` – raw spans computed for this exact `html`; a `<` strictly inside
///   one is copied through untouched.
///
/// Before a matched tag, trailing spaces/tabs on the current output line are
/// dropped and a newline is inserted, unless the tag already starts its line
/// (only whitespace precedes it), in which case that indentation is kept.
/// After the tag a newline is inserted unless the input already has one there
/// or the input ends.
fn break_html_tags(html: &str, tags: &[&str], spans: &[(usize, usize)]) -> String {
    /// True when the current (last) line of `out` holds nothing but whitespace.
    /// This also covers the first line, where there is no preceding newline.
    fn on_blank_line(out: &str) -> bool {
        let current_line = out.rfind('\n').map_or(out, |nl| &out[nl + 1..]);
        current_line.chars().all(char::is_whitespace)
    }

    let mut out = String::with_capacity(html.len() + 256);
    let bytes = html.as_bytes();
    let len = bytes.len();
    let mut i = 0usize;

    while i < len {
        // Check the raw span first: it is cheap, while matching a tag allocates.
        if bytes[i] == b'<' && !is_in_raw_span(spans, i) {
            if let Some((matched, byte_len)) = match_html_block_tag(&html[i..], tags) {
                if !on_blank_line(&out) {
                    while out.ends_with([' ', '\t']) {
                        out.pop();
                    }
                    out.push('\n');
                }
                out.push_str(&matched);
                i += byte_len;
                if i < len && bytes[i] != b'\n' {
                    out.push('\n');
                }
                continue;
            }
        }

        // Anything else is copied through one whole character at a time.
        let ch = html[i..]
            .chars()
            .next()
            .expect("i < len and on a char boundary");
        out.push(ch);
        i += ch.len_utf8();
    }

    out
}
/// Tries to read an opening or closing tag named in `tags` from the start of `s`.
///
/// `s` must begin with `<`, optionally followed by `/`, then a name made of
/// alphanumeric characters and `-`. The name is lower-cased before it is
/// looked up in `tags`. The end of the tag is located with
/// `super::find_html_tag_close`.
///
/// Returns the full tag text and its length in bytes, or `None` when `s` does
/// not start with such a tag or no closing position is found.
fn match_html_block_tag(s: &str, tags: &[&str]) -> Option<(String, usize)> {
    let after_bracket = s.strip_prefix('<')?;
    // A leading `/` marks a closing tag; the name follows either way.
    let name_region = after_bracket.strip_prefix('/').unwrap_or(after_bracket);

    let name_len = name_region
        .find(|c: char| !(c.is_alphanumeric() || c == '-'))
        .unwrap_or(name_region.len());
    if name_len == 0 {
        return None;
    }

    let lowered = name_region[..name_len].to_lowercase();
    if !tags.iter().any(|&tag| tag == lowered.as_str()) {
        return None;
    }

    let tag_len = super::find_html_tag_close(s)? + 1;
    Some((s[..tag_len].to_string(), tag_len))
}
/// Copies `html`, placing every `{% ... %}` tag whose keyword is in `kws` on
/// its own line.
///
/// * `kws` – keywords to break around; a tag qualifies when its keyword (see
///   `extract_keyword`) equals an entry, or starts with an entry followed by
///   a space or tab.
/// * `spans` – raw spans computed for this exact `html`; template tags and
///   HTML tag starts strictly inside one are left alone.
///
/// Template tags that appear inside an HTML opening tag (between `<name` and
/// the closing `>`) are never broken: outside attribute quotes, `{% ... %}`
/// and `{{ ... }}` there are copied verbatim, which also keeps a `>` or quote
/// inside them from being read as HTML syntax.
///
/// When a tag is broken out, spaces/tabs directly before it are removed (this
/// drops its indentation too), a newline is inserted before it unless it
/// already starts its line, and a newline is inserted after it unless the
/// input already has one there or the input ends.
fn break_template_tags(html: &str, kws: &[&str], spans: &[(usize, usize)]) -> String {
    /// True when the current (last) line of `out` holds nothing but whitespace.
    /// This also covers the first line, where there is no preceding newline.
    fn on_blank_line(out: &str) -> bool {
        let current_line = out.rfind('\n').map_or(out, |nl| &out[nl + 1..]);
        current_line.chars().all(char::is_whitespace)
    }

    /// Appends the character starting at byte `at` and returns the offset of
    /// the next character.
    fn copy_char(html: &str, at: usize, out: &mut String) -> usize {
        let ch = html[at..]
            .chars()
            .next()
            .expect("`at` is in bounds and on a char boundary");
        out.push(ch);
        at + ch.len_utf8()
    }

    let mut out = String::with_capacity(html.len() + 256);
    let bytes = html.as_bytes();
    let len = bytes.len();
    let mut i = 0usize;
    let mut in_html_open_tag = false;
    let mut html_attr_quote: Option<u8> = None;

    while i < len {
        let b = bytes[i];
        let next = bytes.get(i + 1).copied();

        if in_html_open_tag {
            match html_attr_quote {
                // Inside a quoted attribute value: only the matching quote matters.
                Some(quote) => {
                    if b == quote {
                        html_attr_quote = None;
                    }
                }
                None => match (b, next) {
                    (b'"', _) | (b'\'', _) => html_attr_quote = Some(b),
                    (b'>', _) => in_html_open_tag = false,
                    (b'{', Some(b'%')) => {
                        if let Some(tag_end) = find_template_tag_end(html, i + 2) {
                            out.push_str(&html[i..tag_end]);
                            i = tag_end;
                            continue;
                        }
                    }
                    (b'{', Some(b'{')) => {
                        if let Some(end) = find_close(html, i + 2, "}}") {
                            out.push_str(&html[i..end + 2]);
                            i = end + 2;
                            continue;
                        }
                    }
                    _ => {}
                },
            }
            i = copy_char(html, i, &mut out);
            continue;
        }

        // `<` followed by a letter starts an HTML opening tag (closing tags
        // start with `</` and are not tracked).
        if b == b'<'
            && next.map_or(false, |n| n.is_ascii_alphabetic())
            && !is_in_raw_span(spans, i)
        {
            in_html_open_tag = true;
            html_attr_quote = None;
        }

        if b == b'{' && next == Some(b'%') && !is_in_raw_span(spans, i) {
            if let Some(tag_end) = find_template_tag_end(html, i + 2) {
                let full_tag = &html[i..tag_end];
                // Drop the `-` / `+` / `~` characters adjacent to the delimiters
                // (presumably whitespace-control markers) before reading the keyword.
                let inner =
                    html[i + 2..tag_end - 2].trim_matches(|c| c == '-' || c == '+' || c == '~');
                let keyword = extract_keyword(inner);
                let should_break = kws.iter().any(|&k| {
                    keyword
                        .strip_prefix(k)
                        .map_or(false, |rest| rest.is_empty() || rest.starts_with([' ', '\t']))
                });

                if should_break {
                    let starts_line = on_blank_line(&out);
                    while out.ends_with([' ', '\t']) {
                        out.pop();
                    }
                    // If the tag did not start its line, the line still ends in a
                    // non-blank character here, so a newline is always needed.
                    if !starts_line {
                        out.push('\n');
                    }
                    out.push_str(full_tag);
                    i = tag_end;
                    if i < len && bytes[i] != b'\n' {
                        out.push('\n');
                    }
                    continue;
                }
            }
        }

        i = copy_char(html, i, &mut out);
    }

    out
}
/// Scans `html` from byte offset `from` for the `%}` that closes a template
/// tag and returns the offset just past it.
///
/// A `%}` inside a single- or double-quoted string is ignored; a string ends
/// at the next occurrence of the quote character that opened it. Returns
/// `None` if no closing `%}` is found outside a string.
fn find_template_tag_end(html: &str, from: usize) -> Option<usize> {
    let bytes = html.as_bytes();
    let mut open_quote: Option<u8> = None;

    for (pos, &byte) in bytes.iter().enumerate().skip(from) {
        match open_quote {
            Some(quote) => {
                if byte == quote {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'%' if bytes.get(pos + 1) == Some(&b'}') => return Some(pos + 2),
                _ => {}
            },
        }
    }

    None
}
/// Returns the leading keyword of a template tag's inner text.
///
/// Leading whitespace and `-` / `+` / `~` characters are skipped. If what
/// remains starts with `else if`, that two-word form is returned; otherwise
/// the first whitespace-delimited word is returned with any trailing
/// `-` / `+` / `~` removed. Yields `""` when there is no word at all.
fn extract_keyword(inner: &str) -> &str {
    const MARKERS: [char; 3] = ['-', '+', '~'];

    let body = inner.trim_start_matches(|c: char| MARKERS.contains(&c) || c.is_whitespace());
    if body.starts_with("else if") {
        "else if"
    } else {
        match body.split_whitespace().next() {
            Some(word) => word.trim_end_matches(MARKERS),
            None => "",
        }
    }
}
/// Returns `html` with every run of three or more consecutive `\n` characters
/// reduced to exactly two (that is, at most one empty line in a row).
///
/// Only directly adjacent newlines count as a run; any other character,
/// including spaces, starts a new run.
fn collapse_blank_lines(html: &str) -> String {
    let mut collapsed = String::with_capacity(html.len());
    let mut newline_run = 0u32;

    for ch in html.chars() {
        newline_run = if ch == '\n' { newline_run + 1 } else { 0 };
        // Non-newlines reset the run to 0, so they always pass this check.
        if newline_run <= 2 {
            collapsed.push(ch);
        }
    }

    collapsed
}