use crate::html::entities::reverse_lookup_entity;
use crate::html::{is_raw_text_element, is_void_element};
use crate::tree::{Document, NodeId, NodeKind};
/// Serializes `doc` to an HTML string.
///
/// Each child of the document root is written in order. Non-ASCII characters
/// are re-encoded through the entity table unless a `<meta>` element in the
/// document declares a UTF-8 charset. The result always ends with a newline.
#[must_use]
pub fn serialize_html(doc: &Document) -> String {
    // Only documents that do not announce UTF-8 need entity re-encoding.
    let reencode_non_ascii = !detect_utf8_charset(doc);
    let top = doc.root();
    let mut html = String::new();
    for node in doc.children(top) {
        serialize_html_node(doc, node, &mut html, reencode_non_ascii);
    }
    // '\n' is a single byte in UTF-8, so a byte comparison is sufficient.
    if html.as_bytes().last() != Some(&b'\n') {
        html.push('\n');
    }
    html
}
/// Serializes `doc` to a string using the HTML5 rules implemented by
/// `serialize_html5_node`.
///
/// Each child of the document root is written in order; no entity re-encoding
/// of non-ASCII characters takes place. The result always ends with a newline.
#[must_use]
pub fn serialize_html5(doc: &Document) -> String {
    let top = doc.root();
    let mut html = String::new();
    for node in doc.children(top) {
        serialize_html5_node(doc, node, &mut html);
    }
    // '\n' is a single byte in UTF-8, so a byte comparison is sufficient.
    if html.as_bytes().last() != Some(&b'\n') {
        html.push('\n');
    }
    html
}
/// Returns `true` when any subtree hanging off the document root contains a
/// `<meta>` element that declares UTF-8 (as decided by `check_meta_charset`).
fn detect_utf8_charset(doc: &Document) -> bool {
    doc.children(doc.root())
        .any(|top_level| check_meta_charset(doc, top_level))
}
/// Searches the element `id` and its descendants for a UTF-8 declaration.
///
/// A `meta` element counts when it either has `charset="utf-8"` or has
/// `http-equiv="content-type"` together with a `content` value containing
/// `charset=utf-8`. Attribute values are compared ASCII-case-insensitively;
/// element and attribute names are compared exactly. Non-element nodes never
/// match and are not descended into.
fn check_meta_charset(doc: &Document, id: NodeId) -> bool {
    let NodeKind::Element {
        name, attributes, ..
    } = &doc.node(id).kind
    else {
        return false;
    };

    if name == "meta" {
        // Short form: <meta charset="utf-8">
        let has_charset_attr = attributes
            .iter()
            .any(|a| a.name == "charset" && a.value.eq_ignore_ascii_case("utf-8"));
        if has_charset_attr {
            return true;
        }

        // Long form: <meta http-equiv="content-type" content="...charset=utf-8">
        let is_content_type = attributes
            .iter()
            .any(|a| a.name == "http-equiv" && a.value.eq_ignore_ascii_case("content-type"));
        let content_declares_utf8 = is_content_type
            && attributes.iter().any(|a| {
                a.name == "content" && a.value.to_ascii_lowercase().contains("charset=utf-8")
            });
        if content_declares_utf8 {
            return true;
        }
    }

    doc.children(id)
        .any(|descendant| check_meta_charset(doc, descendant))
}
/// Returns `true` if `tag` is one of the elements the HTML serializer treats
/// as inline, i.e. never surrounds with formatting newlines.
///
/// The comparison is exact, so callers are expected to pass a lowercase name.
fn is_inline_element(tag: &str) -> bool {
    const INLINE_TAGS: &[&str] = &[
        "a", "abbr", "acronym", "b", "bdo", "big", "br", "cite", "code", "dfn", "em", "font", "i",
        "img", "input", "kbd", "label", "q", "s", "samp", "select", "small", "span", "strike",
        "strong", "sub", "sup", "textarea", "tt", "u", "var",
    ];
    INLINE_TAGS.contains(&tag)
}
/// Returns `true` for node kinds that serialize as character data: text,
/// CDATA sections and entity references.
fn is_text_like(kind: &NodeKind) -> bool {
    match kind {
        NodeKind::Text { .. } | NodeKind::CData { .. } | NodeKind::EntityRef { .. } => true,
        _ => false,
    }
}
/// Pushes a newline after the start tag of element `id` when formatting
/// calls for one.
///
/// Nothing is written if `tag` is inline or begins with `p`, if the element
/// has no children, if its first child is text-like, or if it has only a
/// single child.
fn maybe_newline_after_open(doc: &Document, id: NodeId, tag: &str, out: &mut String) {
    let tag_suppresses_break = is_inline_element(tag) || tag.starts_with('p');
    if tag_suppresses_break {
        return;
    }
    if let Some(first) = doc.first_child(id) {
        let first_is_markup = !is_text_like(&doc.node(first).kind);
        if first_is_markup && doc.next_sibling(first).is_some() {
            out.push('\n');
        }
    }
}
/// Pushes a newline before the end tag of element `id` when formatting
/// calls for one.
///
/// Nothing is written if `tag` is inline or begins with `p`, if the element
/// has no children, if its last child is text-like, or if it has only a
/// single child.
fn maybe_newline_before_close(doc: &Document, id: NodeId, tag: &str, out: &mut String) {
    let tag_suppresses_break = is_inline_element(tag) || tag.starts_with('p');
    if tag_suppresses_break {
        return;
    }
    let (Some(first), Some(last)) = (doc.first_child(id), doc.last_child(id)) else {
        return;
    };
    let last_is_markup = !is_text_like(&doc.node(last).kind);
    if last_is_markup && doc.next_sibling(first).is_some() {
        out.push('\n');
    }
}
/// Pushes a newline after element `id` has been fully written when formatting
/// calls for one.
///
/// Nothing is written if `tag` is inline, if the element has no next sibling,
/// if that sibling is text-like, or if the parent's name begins with `p`.
fn maybe_newline_after_close(doc: &Document, id: NodeId, tag: &str, out: &mut String) {
    if is_inline_element(tag) {
        return;
    }
    let followed_by_markup = doc
        .next_sibling(id)
        .is_some_and(|next| !is_text_like(&doc.node(next).kind));
    if !followed_by_markup {
        return;
    }
    let parent_suppresses_break = doc
        .parent(id)
        .is_some_and(|parent| doc.node_name(parent).unwrap_or("").starts_with('p'));
    if !parent_suppresses_break {
        out.push('\n');
    }
}
/// Appends the HTML serialization of node `id` and its subtree to `out`.
///
/// * Elements: start tag with attributes, then children, then end tag. An
///   attribute whose value equals its name is written minimized (name only).
///   A value containing `"` but no `'` is single-quoted; every other value is
///   double-quoted, with URI attributes routed through `write_html_uri_attr`.
///   Void elements get neither children nor an end tag. Text children of
///   raw-text elements are copied verbatim. Line breaks around tags are
///   decided by the `maybe_newline_*` helpers.
/// * Text and CDATA nodes are written as escaped text.
/// * Comments, processing instructions, entity references and doctypes are
///   written in markup form; the document node itself writes nothing.
///
/// `reencode` is forwarded unchanged to the escaping helpers.
#[allow(clippy::too_many_lines)] // one arm per node kind keeps the format rules in one place
fn serialize_html_node(doc: &Document, id: NodeId, out: &mut String, reencode: bool) {
    match &doc.node(id).kind {
        NodeKind::Element {
            name,
            prefix,
            attributes,
            ..
        } => {
            // The qualified name is needed for both the start and the end tag.
            let push_qualified_name = |buf: &mut String| {
                if let Some(pfx) = prefix {
                    buf.push_str(pfx);
                    buf.push(':');
                }
                buf.push_str(name);
            };

            out.push('<');
            push_qualified_name(out);

            for attr in attributes {
                out.push(' ');
                if let Some(pfx) = &attr.prefix {
                    out.push_str(pfx);
                    out.push(':');
                }
                out.push_str(&attr.name);

                // name == value is emitted in minimized (boolean) form.
                if attr.value == attr.name {
                    continue;
                }

                // Prefer single quotes only when that avoids escaping `"`.
                let use_single_quotes = attr.value.contains('"') && !attr.value.contains('\'');
                if use_single_quotes {
                    out.push_str("='");
                    write_html_escaped_attr_sq(out, &attr.value, reencode);
                    out.push('\'');
                    continue;
                }

                out.push_str("=\"");
                if is_uri_attribute(&attr.name) {
                    write_html_uri_attr(out, &attr.value, reencode);
                } else {
                    write_html_escaped_attr(out, &attr.value, reencode);
                }
                out.push('"');
            }
            out.push('>');

            let tag = name.to_ascii_lowercase();
            if is_void_element(&tag) {
                maybe_newline_after_close(doc, id, &tag, out);
                return;
            }

            maybe_newline_after_open(doc, id, &tag, out);

            let verbatim_text = is_raw_text_element(&tag);
            for child in doc.children(id) {
                match &doc.node(child).kind {
                    // Script/style-like content must not be entity-escaped.
                    NodeKind::Text { content } if verbatim_text => out.push_str(content),
                    _ => serialize_html_node(doc, child, out, reencode),
                }
            }

            maybe_newline_before_close(doc, id, &tag, out);
            out.push_str("</");
            push_qualified_name(out);
            out.push('>');
            maybe_newline_after_close(doc, id, &tag, out);
        }
        NodeKind::Text { content } => write_html_escaped_text(out, content, reencode),
        NodeKind::CData { content } => write_html_escaped_text(out, content, reencode),
        NodeKind::Comment { content } => {
            out.push_str("<!--");
            out.push_str(content);
            out.push_str("-->");
        }
        NodeKind::ProcessingInstruction { target, data } => {
            // HTML-style processing instruction: closed by a bare `>`.
            out.push_str("<?");
            out.push_str(target);
            if let Some(payload) = data {
                out.push(' ');
                out.push_str(payload);
            }
            out.push('>');
        }
        NodeKind::EntityRef { name, .. } => {
            out.push('&');
            out.push_str(name);
            out.push(';');
        }
        NodeKind::DocumentType {
            name,
            system_id,
            public_id,
            ..
        } => {
            out.push_str("<!DOCTYPE ");
            out.push_str(name);
            if let Some(pub_id) = public_id {
                out.push_str(" PUBLIC \"");
                out.push_str(pub_id);
                out.push('"');
                // After a public id, an empty system id is left out entirely.
                if let Some(sys_id) = system_id {
                    if !sys_id.is_empty() {
                        out.push_str(" \"");
                        out.push_str(sys_id);
                        out.push('"');
                    }
                }
            } else if let Some(sys_id) = system_id {
                out.push_str(" SYSTEM \"");
                out.push_str(sys_id);
                out.push('"');
            }
            out.push_str(">\n");
        }
        NodeKind::Document => {}
    }
}
/// Appends `text` to `out`, escaped for use as HTML character data.
///
/// `&`, `<` and `>` are written as `&amp;`, `&lt;` and `&gt;`. When
/// `reencode` is true, every non-ASCII character for which
/// `reverse_lookup_entity` returns a name is written as `&name;`. All other
/// characters are copied unchanged.
fn write_html_escaped_text(out: &mut String, text: &str, reencode: bool) {
    for ch in text.chars() {
        match ch {
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            c if reencode && !c.is_ascii() => match reverse_lookup_entity(c) {
                Some(name) => {
                    out.push('&');
                    out.push_str(name);
                    out.push(';');
                }
                // No named entity known: keep the character itself.
                None => out.push(c),
            },
            _ => out.push(ch),
        }
    }
}
/// Returns `true` if `name` is an attribute whose value the HTML serializer
/// treats as a URI (and therefore writes via `write_html_uri_attr`).
///
/// The comparison is exact; no case folding is applied.
fn is_uri_attribute(name: &str) -> bool {
    const URI_ATTRIBUTES: &[&str] = &[
        "href",
        "src",
        "action",
        "background",
        "cite",
        "classid",
        "codebase",
        "data",
        "longdesc",
        "profile",
        "usemap",
    ];
    URI_ATTRIBUTES.contains(&name)
}
fn write_html_uri_attr(out: &mut String, text: &str, reencode: bool) {
for ch in text.chars() {
match ch {
'&' => out.push_str("&"),
'"' => out.push_str("""),
'<' => out.push_str("<"),
'>' => out.push_str(">"),
' ' => out.push_str("%20"),
c if reencode && (c as u32) >= 0x80 => {
if let Some(name) = reverse_lookup_entity(c) {
out.push('&');
out.push_str(name);
out.push(';');
} else {
out.push(c);
}
}
_ => out.push(ch),
}
}
}
/// Appends `text` to `out` as the body of a single-quoted attribute value.
///
/// `&`, `'`, `<` and `>` are written as `&amp;`, `&#39;`, `&lt;` and `&gt;`;
/// a double quote is left as-is because it cannot terminate a single-quoted
/// value. When `reencode` is true, every non-ASCII character for which
/// `reverse_lookup_entity` returns a name is written as `&name;`. All other
/// characters are copied unchanged.
fn write_html_escaped_attr_sq(out: &mut String, text: &str, reencode: bool) {
    for ch in text.chars() {
        match ch {
            '&' => out.push_str("&amp;"),
            // Numeric reference: understood by every HTML version.
            '\'' => out.push_str("&#39;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            c if reencode && !c.is_ascii() => match reverse_lookup_entity(c) {
                Some(name) => {
                    out.push('&');
                    out.push_str(name);
                    out.push(';');
                }
                // No named entity known: keep the character itself.
                None => out.push(c),
            },
            _ => out.push(ch),
        }
    }
}
fn write_html_escaped_attr(out: &mut String, text: &str, reencode: bool) {
for ch in text.chars() {
match ch {
'&' => out.push_str("&"),
'"' => out.push_str("""),
'<' => out.push_str("<"),
'>' => out.push_str(">"),
c if reencode && (c as u32) >= 0x80 => {
if let Some(name) = reverse_lookup_entity(c) {
out.push('&');
out.push_str(name);
out.push(';');
} else {
out.push(c);
}
}
_ => out.push(ch),
}
}
}
/// Returns `true` if an HTML element named `tag` is serialized as void in
/// HTML5, i.e. as a start tag only, with no content and no end tag.
///
/// Besides the current void elements this includes the obsolete `basefont`,
/// `bgsound`, `frame`, `keygen` and `param`, which the HTML Standard's
/// fragment serialization algorithm also treats as void. Writing an end tag
/// for them would produce a stray `</param>`-style tag.
///
/// The comparison is exact, so callers are expected to pass a lowercase name.
fn is_html5_void(tag: &str) -> bool {
    matches!(
        tag,
        "area"
            | "base"
            | "basefont"
            | "bgsound"
            | "br"
            | "col"
            | "embed"
            | "frame"
            | "hr"
            | "img"
            | "input"
            | "keygen"
            | "link"
            | "meta"
            | "param"
            | "source"
            | "track"
            | "wbr"
    )
}
/// Returns `true` if text inside an element named `tag` is written verbatim
/// (unescaped) by the HTML5 serializer: currently `script` and `style`.
///
/// The comparison is exact, so callers are expected to pass a lowercase name.
fn is_html5_raw_text(tag: &str) -> bool {
    tag == "script" || tag == "style"
}
/// Appends the HTML5 serialization of node `id` and its subtree to `out`.
///
/// * Elements: `<name attr="value" ...>`, children, `</name>`. Every
///   attribute is written with a double-quoted value. Non-foreign void
///   elements get a start tag only. Childless SVG/MathML elements are written
///   self-closing (`/>`). Text children of raw-text elements (`script`,
///   `style`) are copied verbatim; any other child kind inside them is
///   serialized normally rather than dropped.
/// * Text and CDATA nodes are written as escaped text; entity references are
///   written as `&name;`.
/// * Comments, doctypes and processing instructions are written in markup
///   form.
/// * The document node contributes only its children.
fn serialize_html5_node(doc: &Document, id: NodeId, out: &mut String) {
    match &doc.node(id).kind {
        NodeKind::Element {
            name,
            namespace,
            attributes,
            ..
        } => {
            let is_foreign = namespace.as_deref().is_some_and(|ns| {
                ns == "http://www.w3.org/2000/svg" || ns == "http://www.w3.org/1998/Math/MathML"
            });

            out.push('<');
            out.push_str(name);
            for attr in attributes {
                out.push(' ');
                if let Some(pfx) = &attr.prefix {
                    out.push_str(pfx);
                    out.push(':');
                }
                out.push_str(&attr.name);
                out.push_str("=\"");
                write_html5_escaped_attr(out, &attr.value);
                out.push('"');
            }

            let lower = name.to_ascii_lowercase();
            // Void-ness only applies to HTML elements, not SVG/MathML ones.
            if !is_foreign && is_html5_void(&lower) {
                out.push('>');
                return;
            }
            if is_foreign && doc.first_child(id).is_none() {
                out.push_str("/>");
                return;
            }
            out.push('>');

            let raw_text = is_html5_raw_text(&lower);
            for child in doc.children(id) {
                match &doc.node(child).kind {
                    // Script/style text must not be entity-escaped.
                    NodeKind::Text { content } if raw_text => out.push_str(content),
                    // Anything else is serialized normally so no content is lost.
                    _ => serialize_html5_node(doc, child, out),
                }
            }

            out.push_str("</");
            out.push_str(name);
            out.push('>');
        }
        NodeKind::Text { content } => {
            write_html5_escaped_text(out, content);
        }
        // HTML has no CDATA sections outside foreign content; emit the
        // character data as ordinary escaped text instead of discarding it.
        NodeKind::CData { content } => {
            write_html5_escaped_text(out, content);
        }
        NodeKind::EntityRef { name, .. } => {
            out.push('&');
            out.push_str(name);
            out.push(';');
        }
        NodeKind::Comment { content } => {
            out.push_str("<!--");
            out.push_str(content);
            out.push_str("-->");
        }
        NodeKind::DocumentType {
            name,
            public_id,
            system_id,
            ..
        } => {
            out.push_str("<!DOCTYPE ");
            out.push_str(name);
            if let Some(pub_id) = public_id {
                out.push_str(" PUBLIC \"");
                out.push_str(pub_id);
                out.push('"');
                if let Some(sys_id) = system_id {
                    out.push_str(" \"");
                    out.push_str(sys_id);
                    out.push('"');
                }
            } else if let Some(sys_id) = system_id {
                out.push_str(" SYSTEM \"");
                out.push_str(sys_id);
                out.push('"');
            }
            out.push_str(">\n");
        }
        NodeKind::ProcessingInstruction { target, data } => {
            out.push_str("<?");
            out.push_str(target);
            if let Some(d) = data {
                out.push(' ');
                out.push_str(d);
            }
            out.push('>');
        }
        NodeKind::Document => {
            for child in doc.children(id) {
                serialize_html5_node(doc, child, out);
            }
        }
    }
}
/// Appends `text` to `out`, escaped for use as HTML5 character data.
///
/// `&`, `<` and `>` are written as `&amp;`, `&lt;` and `&gt;`; every other
/// character, including non-ASCII, is copied unchanged.
fn write_html5_escaped_text(out: &mut String, text: &str) {
    for ch in text.chars() {
        match ch {
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            _ => out.push(ch),
        }
    }
}
/// Appends `text` to `out` as the body of a double-quoted HTML5 attribute
/// value.
///
/// `&` and `"` are written as `&amp;` and `&quot;`; every other character,
/// including `<`, `>` and non-ASCII, is copied unchanged.
fn write_html5_escaped_attr(out: &mut String, text: &str) {
    for ch in text.chars() {
        match ch {
            '&' => out.push_str("&amp;"),
            '"' => out.push_str("&quot;"),
            _ => out.push(ch),
        }
    }
}
// Unit tests for the HTML and HTML5 serializers. Most parse a small document
// and assert on substrings of the serialized output; a few build the tree by
// hand to reach node kinds or attribute values the parser would not produce.
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use crate::html::parse_html;

    // Void elements are written as a bare start tag.
    #[test]
    fn test_void_element_br() {
        let doc = parse_html("<html><body><br></body></html>").unwrap();
        let html = serialize_html(&doc);
        assert!(html.contains("<br>"), "expected <br>, got: {html}");
        assert!(!html.contains("<br/>"), "should not have <br/>");
        assert!(!html.contains("</br>"), "should not have </br>");
    }

    #[test]
    fn test_void_element_img_with_attr() {
        let doc = parse_html(r#"<html><body><img src="x.png"></body></html>"#).unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains(r#"<img src="x.png">"#),
            "expected img with src, got: {html}"
        );
        assert!(!html.contains("</img>"), "void element should not close");
    }

    // Empty non-void elements keep their explicit end tag.
    #[test]
    fn test_non_void_empty_element() {
        let doc = parse_html("<html><body><p></p></body></html>").unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains("<p></p>"),
            "expected <p></p>, not self-closing, got: {html}"
        );
    }

    // Raw-text content must come out verbatim, with no entity escaping.
    #[test]
    fn test_script_not_escaped() {
        let doc = parse_html("<html><body><script>if (a < b) {}</script></body></html>").unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains("if (a < b) {}"),
            "script content should not be escaped, got: {html}"
        );
        assert!(
            !html.contains("&lt;"),
            "script content should not contain &lt;"
        );
    }

    #[test]
    fn test_style_not_escaped() {
        let doc = parse_html("<html><body><style>.a > .b {}</style></body></html>").unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains(".a > .b {}"),
            "style content should not be escaped, got: {html}"
        );
        assert!(
            !html.contains("&gt;"),
            "style content should not contain &gt; inside style tag"
        );
    }

    // An attribute whose value equals its name is written minimized.
    #[test]
    fn test_boolean_attribute() {
        let doc = parse_html(r#"<html><body><input disabled="disabled"></body></html>"#).unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains("<input disabled>") || html.contains("<input disabled "),
            "expected boolean attr, got: {html}"
        );
    }

    #[test]
    fn test_regular_attribute_preserved() {
        let doc = parse_html(r#"<html><body><input type="text"></body></html>"#).unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains(r#"type="text""#),
            "expected type=\"text\", got: {html}"
        );
    }

    #[test]
    fn test_multiple_attributes() {
        let doc = parse_html(
            r#"<html><body><input type="text" name="field" value="hello"></body></html>"#,
        )
        .unwrap();
        let html = serialize_html(&doc);
        assert!(html.contains(r#"type="text""#), "missing type attr");
        assert!(html.contains(r#"name="field""#), "missing name attr");
        assert!(html.contains(r#"value="hello""#), "missing value attr");
    }

    // `&`, `<` and `>` in text must be written as entity references.
    #[test]
    fn test_text_escaping() {
        let doc =
            parse_html("<html><body><p>a &amp; b &lt; c &gt; d</p></body></html>").unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains("&amp;") && html.contains("&lt;") && html.contains("&gt;"),
            "expected escaped entities in text, got: {html}"
        );
    }

    #[test]
    fn test_comment_preserved() {
        let doc = parse_html("<html><body><!-- comment --></body></html>").unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains("<!-- comment -->"),
            "comment should be preserved, got: {html}"
        );
    }

    #[test]
    fn test_doctype_serialization() {
        let doc = parse_html(
            r#"<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><body></body></html>"#,
        )
        .unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains("<!DOCTYPE html"),
            "expected DOCTYPE, got: {html}"
        );
        assert!(
            html.contains("PUBLIC"),
            "expected PUBLIC in DOCTYPE, got: {html}"
        );
    }

    // A declared UTF-8 charset turns off entity re-encoding of non-ASCII text.
    #[test]
    fn test_meta_charset_utf8() {
        let doc =
            parse_html(r#"<html><head><meta charset="utf-8"></head><body>café</body></html>"#)
                .unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains("café") || html.contains("caf"),
            "UTF-8 content should be preserved, got: {html}"
        );
    }

    #[test]
    fn test_inline_element_no_newlines() {
        let doc = parse_html("<html><body><p>Hello <span>world</span></p></body></html>").unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains("<span>world</span>"),
            "inline element should not have extra newlines, got: {html}"
        );
    }

    #[test]
    fn test_trailing_newline() {
        let doc = parse_html("<html><body></body></html>").unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.ends_with('\n'),
            "output should end with newline, got: {html:?}"
        );
    }

    #[test]
    fn test_nested_elements() {
        let doc =
            parse_html("<html><body><div><ul><li>one</li><li>two</li></ul></div></body></html>")
                .unwrap();
        let html = serialize_html(&doc);
        assert!(html.contains("<ul>"), "missing <ul>");
        assert!(html.contains("<li>one</li>"), "missing first li");
        assert!(html.contains("<li>two</li>"), "missing second li");
        assert!(html.contains("</ul>"), "missing </ul>");
    }

    // Built by hand: an entity reference node must come out as `&name;`.
    #[test]
    fn test_entity_ref_serialization() {
        let mut doc = Document::new();
        let root = doc.root();
        let html_id = doc.create_node(NodeKind::Element {
            name: "html".to_string(),
            prefix: None,
            namespace: None,
            attributes: vec![],
        });
        doc.append_child(root, html_id);
        let body_id = doc.create_node(NodeKind::Element {
            name: "body".to_string(),
            prefix: None,
            namespace: None,
            attributes: vec![],
        });
        doc.append_child(html_id, body_id);
        let entity_id = doc.create_node(NodeKind::EntityRef {
            name: "nbsp".to_string(),
            value: None,
        });
        doc.append_child(body_id, entity_id);
        let html = serialize_html(&doc);
        assert!(
            html.contains("&nbsp;"),
            "entity reference should be preserved, got: {html}"
        );
    }

    #[test]
    fn test_full_html_document() {
        let input =
            "<html><head><title>Test</title></head><body><h1>Hello</h1><p>World</p></body></html>";
        let doc = parse_html(input).unwrap();
        let html = serialize_html(&doc);
        assert!(html.contains("<html>"), "missing <html>");
        assert!(html.contains("<head>"), "missing <head>");
        assert!(html.contains("<title>Test</title>"), "missing title");
        assert!(html.contains("<body>"), "missing <body>");
        assert!(html.contains("Hello"), "missing h1 content");
        assert!(html.contains("World"), "missing p content");
        assert!(html.contains("</html>"), "missing </html>");
    }

    #[test]
    fn test_uri_attribute_space() {
        let doc = parse_html(r#"<html><body><a href="a b">link</a></body></html>"#).unwrap();
        let html = serialize_html(&doc);
        assert!(
            html.contains("a%20b"),
            "spaces in href should be encoded as %20, got: {html}"
        );
    }

    // Built by hand: a value containing `"` but no `'` is single-quoted.
    #[test]
    fn test_attr_with_quotes() {
        let mut doc = Document::new();
        let root = doc.root();
        let html_id = doc.create_node(NodeKind::Element {
            name: "html".to_string(),
            prefix: None,
            namespace: None,
            attributes: vec![],
        });
        doc.append_child(root, html_id);
        let body_id = doc.create_node(NodeKind::Element {
            name: "body".to_string(),
            prefix: None,
            namespace: None,
            attributes: vec![],
        });
        doc.append_child(html_id, body_id);
        let div_id = doc.create_node(NodeKind::Element {
            name: "div".to_string(),
            prefix: None,
            namespace: None,
            attributes: vec![crate::tree::Attribute {
                name: "title".to_string(),
                value: "say \"hello\"".to_string(),
                prefix: None,
                namespace: None,
                raw_value: None,
            }],
        });
        doc.append_child(body_id, div_id);
        let html = serialize_html(&doc);
        assert!(
            html.contains("title='say \"hello\"'"),
            "expected single-quoted attr, got: {html}"
        );
    }

    #[test]
    fn test_html5_basic_roundtrip() {
        let doc = crate::html5::parse_html5("<p>Hello</p>").unwrap();
        let html = serialize_html5(&doc);
        assert!(html.contains("<p>Hello</p>"), "got: {html}");
        assert!(html.contains("<html>"), "got: {html}");
    }

    #[test]
    fn test_html5_void_elements() {
        let doc = crate::html5::parse_html5("<br><hr><img src=\"x.png\">").unwrap();
        let html = serialize_html5(&doc);
        assert!(html.contains("<br>"), "got: {html}");
        assert!(!html.contains("</br>"), "void should not close: {html}");
        assert!(html.contains("<hr>"), "got: {html}");
        assert!(html.contains("<img"), "got: {html}");
    }

    #[test]
    fn test_html5_raw_text() {
        let doc = crate::html5::parse_html5("<script>if (a < b) {}</script>").unwrap();
        let html = serialize_html5(&doc);
        assert!(
            html.contains("if (a < b) {}"),
            "script content should not be escaped: {html}"
        );
    }

    #[test]
    fn test_html5_preserves_utf8() {
        let doc = crate::html5::parse_html5("<p>café</p>").unwrap();
        let html = serialize_html5(&doc);
        assert!(html.contains("café"), "UTF-8 should be preserved: {html}");
    }

    // Childless SVG/MathML elements are written in self-closing form.
    #[test]
    fn test_html5_foreign_self_closing() {
        let doc =
            crate::html5::parse_html5("<svg><circle cx=\"50\" cy=\"50\" r=\"40\"/></svg>").unwrap();
        let html = serialize_html5(&doc);
        assert!(html.contains("<circle"), "got: {html}");
        assert!(
            html.contains("/>"),
            "foreign empty element should self-close: {html}"
        );
    }
}