pub fn normalize(html: &str) -> String {
let mut s = html.to_string();
s = collapse_self_closing(&s);
s = collapse_attribute_form(&s);
s = normalize_entities(&s);
s = strip_block_boundary_whitespace(&s);
s.trim().to_string()
}
/// Rewrites the bare (`<br>`) and slash-without-space (`<br/>`) spellings of
/// `br`, `hr`, `img` and `input` to the spaced form (`<br />`).
///
/// Matching is exact and case-sensitive, so tags that carry attributes, use
/// different casing, or are already in the spaced form are left untouched.
fn collapse_self_closing(s: &str) -> String {
    const TAGS: [&str; 4] = ["br", "hr", "img", "input"];

    TAGS.iter().fold(s.to_owned(), |text, name| {
        let spaced = format!("<{} />", name);
        // Bare form first, then the slash-without-space form.
        text.replace(&format!("<{}>", name), &spaced)
            .replace(&format!("<{}/>", name), &spaced)
    })
}
/// Returns an owned copy of `s`.
///
/// This pass currently performs no rewriting: the output is always equal to
/// the input.
fn collapse_attribute_form(s: &str) -> String {
    String::from(s)
}
/// Applies two apostrophe replacements to `s` and returns the result.
///
/// NOTE(review): as written, both replacements map `'` to `'`, so the output
/// always equals the input. Presumably the search patterns were meant to be
/// entity spellings of the apostrophe -- confirm against the intended
/// behaviour before relying on this pass.
fn normalize_entities(s: &str) -> String {
    s.replace("'", "'").replace("'", "'")
}
/// Strips spaces and newlines that sit just inside the boundaries of block
/// tags.
///
/// For every tag in the table below, whitespace directly after the
/// attribute-less opening tag (`<p>`) and directly before the closing tag
/// (`</p>`) is removed. Tags are processed one at a time in table order, each
/// working on the output of the previous one.
fn strip_block_boundary_whitespace(s: &str) -> String {
    const BLOCK_TAGS: &[&str] = &[
        "p", "li", "blockquote", "ul", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "div",
        "table", "thead", "tbody", "tr", "th", "td", "pre",
    ];

    BLOCK_TAGS.iter().fold(s.to_owned(), |text, name| {
        let opening = format!("<{}>", name);
        let closing = format!("</{}>", name);
        // Opening tag first, then the closing tag on that result.
        let after_open = squash_newlines_after(&text, &opening);
        squash_newlines_before(&after_open, &closing)
    })
}
/// Removes every run of spaces and newlines that immediately follows an
/// occurrence of `marker` in `s`.
///
/// Occurrences are matched left to right without overlap, and all other text
/// is copied through unchanged. Only `' '` and `'\n'` are stripped; tabs and
/// carriage returns are kept.
///
/// An empty `marker` returns `s` unchanged.
fn squash_newlines_after(s: &str, marker: &str) -> String {
    const SQUASHED: &[char] = &[' ', '\n'];

    // An empty marker matches at every position without consuming any input,
    // so scanning for it could never make progress.
    if marker.is_empty() {
        return s.to_string();
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find(marker) {
        let after = pos + marker.len();
        // Copy the text before the marker and the marker itself in one slice.
        out.push_str(&rest[..after]);
        // Drop the whitespace run that follows, then keep scanning from there.
        rest = rest[after..].trim_start_matches(SQUASHED);
    }
    out.push_str(rest);
    out
}
/// Removes every run of spaces and newlines that immediately precedes an
/// occurrence of `marker` in `s`.
///
/// Occurrences are matched left to right without overlap, and all other text
/// is copied through unchanged. Only `' '` and `'\n'` are stripped; tabs and
/// carriage returns are kept.
///
/// An empty `marker` returns `s` unchanged.
fn squash_newlines_before(s: &str, marker: &str) -> String {
    const SQUASHED: &[char] = &[' ', '\n'];

    // An empty marker matches at every position without consuming any input,
    // so scanning for it could never make progress.
    if marker.is_empty() {
        return s.to_string();
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find(marker) {
        out.push_str(&rest[..pos]);
        // Trim the accumulated output, not just the segment appended above,
        // so the result matches a scan that pops trailing whitespace from
        // everything emitted so far.
        let kept = out.trim_end_matches(SQUASHED).len();
        out.truncate(kept);
        out.push_str(marker);
        rest = &rest[pos + marker.len()..];
    }
    out.push_str(rest);
    out
}