use skyscraper::html::{self, grammar::document_builder::DocumentBuilder};
use crate::test_framework;
/// Plain text inside `<body>` is expected to come back as a single text child of `body`.
#[test]
fn data_state_basic_text() {
    let markup = "<html><body>hello world</body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_text("hello world"))
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Serialises a body containing a tab and a newline and checks both survive the round trip.
#[test]
fn data_state_special_characters_preserved() {
    let rendered = html::parse("<html><body>a\tb\nc</body></html>")
        .unwrap()
        .to_string();

    assert!(
        rendered.contains("a\tb\nc"),
        "tab and newline should be preserved. Got: {}",
        rendered
    );
}
/// A `<div>` start tag inside the body must yield a `div` element holding its text.
#[test]
fn data_state_less_than_sign_transitions_to_tag_open() {
    let markup = "<html><body><div>content</div></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| d.add_text("content"))
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// The five basic named references (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`)
/// must each resolve to their literal character inside a `<p>`.
#[test]
fn named_character_reference_basic_entities() {
    // The input has to spell the references out; feeding the raw characters
    // `&<>"'` would never reach the named-character-reference path.
    let text = "<html><body><p>&amp;&lt;&gt;&quot;&apos;</p></body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| {
                    body.add_element("p", |p| p.add_text("&<>\"'"))
                })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// `&nbsp;` must resolve to U+00A0 (no-break space) in text content.
#[test]
fn named_character_reference_nbsp() {
    // Use the named reference itself so the test exercises reference
    // resolution rather than pass-through of an already-decoded character.
    let text = "<html><body><p>a&nbsp;b</p></body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| {
                    body.add_element("p", |p| p.add_text("a\u{00A0}b"))
                })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// `&copy;` must resolve to U+00A9 (copyright sign).
#[test]
fn named_character_reference_copy_sign() {
    // The reference is written out explicitly; a literal `©` in the input
    // would pass without touching named-reference handling.
    let text = "<html><body>&copy;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("\u{00A9}"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// Decimal references `&#65;`, `&#97;` and `&#48;` must resolve to `A`, `a` and `0`.
#[test]
fn decimal_character_reference_basic() {
    // Spell out the decimal references; the plain text "Aa0" would not
    // exercise numeric reference parsing at all.
    let text = "<html><body>&#65;&#97;&#48;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("Aa0"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// Decimal references above the ASCII range (`&#169;`, `&#8212;`) must resolve
/// to U+00A9 and U+2014.
#[test]
fn decimal_character_reference_multibyte() {
    // 169 = 0xA9 (copyright sign), 8212 = 0x2014 (em dash).
    let text = "<html><body>&#169;&#8212;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("\u{00A9}\u{2014}"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// The decimal reference `&#0;` must be replaced with U+FFFD.
#[test]
fn decimal_character_reference_null_replaced_with_fffd() {
    // The input carries the null reference itself; a literal U+FFFD in the
    // markup would make the expectation hold trivially.
    let text = "<html><body>&#0;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("\u{FFFD}"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// Hex references introduced with a lowercase `x` (`&#x41;` etc.) must resolve.
#[test]
fn hexadecimal_character_reference_lowercase_x() {
    // 0x41 = 'A', 0x61 = 'a', 0x30 = '0'.
    let text = "<html><body>&#x41;&#x61;&#x30;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("Aa0"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// Hex references introduced with an uppercase `X` (`&#X41;`) must resolve too.
#[test]
fn hexadecimal_character_reference_uppercase_x() {
    // 0x41 = 'A', 0x5A = 'Z'; the uppercase `X` prefix is the point of the test.
    let text = "<html><body>&#X41;&#X5A;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("AZ"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// Hex digits may mix upper and lower case: `&#xfF;` and `&#xFf;` both mean U+00FF.
#[test]
fn hexadecimal_character_reference_mixed_case_digits() {
    // Both spellings use one lowercase and one uppercase hex digit.
    let text = "<html><body>&#xfF;&#xFf;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("\u{00FF}\u{00FF}"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// Hex references for code points outside ASCII, including one beyond the BMP,
/// must resolve to the matching characters.
#[test]
fn hexadecimal_character_reference_multibyte_unicode() {
    // U+2014 is an em dash; U+1F600 lies outside the Basic Multilingual Plane.
    let text = "<html><body>&#x2014;&#x1F600;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("\u{2014}\u{1F600}"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// The hex reference `&#x0;` must be replaced with U+FFFD.
#[test]
fn hexadecimal_character_reference_null_replaced_with_fffd() {
    // Feed the null reference, not an already-substituted replacement character.
    let text = "<html><body>&#x0;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("\u{FFFD}"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// A hex reference that is not terminated by `;` must still produce its character.
#[test]
fn hexadecimal_character_reference_missing_semicolon() {
    // `&#x41` is followed by a space instead of a semicolon.
    let text = "<html><body>&#x41 end</body></html>";
    let document = html::parse(text).unwrap();
    let output = document.to_string();

    // Check the resolved character together with the trailing text so that an
    // unresolved reference cannot satisfy the assertion.
    assert!(
        output.contains("A end"),
        "hex char ref without semicolon should still produce character. Got: {}",
        output
    );
}
/// The C1 control reference `&#x80;` must be remapped to U+20AC (euro sign).
#[test]
fn hexadecimal_character_reference_control_character_replacement() {
    // The input must carry the 0x80 reference; a literal euro sign would skip
    // the replacement step this test is named after.
    let text = "<html><body>&#x80;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("\u{20AC}"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// The decimal reference `&#141;` (0x8D) is expected to come through as U+008D.
#[test]
fn decimal_character_reference_control_character_replacement_0x8d() {
    // 141 decimal == 0x8D. The expectation below keeps the code point as-is.
    let text = "<html><body>&#141;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("\u{008D}"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// `&amp;` inside a double-quoted attribute value must resolve to a bare `&`.
#[test]
fn character_reference_in_attribute_value() {
    // The query-string separator is written as `&amp;` so the attribute value
    // goes through character-reference resolution.
    let text = r#"<html><body><a href="?a=1&amp;b=2">link</a></body></html>"#;
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| {
                    body.add_element("a", |a| {
                        a.add_attribute_str("href", "?a=1&b=2").add_text("link")
                    })
                })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// Hex references inside an attribute value must resolve to their characters.
#[test]
fn hexadecimal_character_reference_in_attribute() {
    // 0x41, 0x42, 0x43 = "ABC".
    let text = r#"<html><body><div data-val="&#x41;&#x42;&#x43;">x</div></body></html>"#;
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| {
                    body.add_element("div", |div| {
                        div.add_attribute_str("data-val", "ABC").add_text("x")
                    })
                })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// `&#x;` has no digits, so it is not a reference and must be kept as literal text.
#[test]
fn numeric_character_reference_no_digits_hex() {
    let text = "<html><body>&#x;</body></html>";
    let document = html::parse(text).unwrap();
    let output = document.to_string();

    // Per the failure message, serialisation escapes the ampersand, so the
    // literal text shows up as `&amp;#x;` in the output.
    assert!(
        output.contains("&amp;#x;"),
        "&#x; with no digits should be emitted literally (& escaped). Got: {}",
        output
    );
}
/// `&#;` has no digits, so it is not a reference and must be kept as literal text.
#[test]
fn numeric_character_reference_no_digits_decimal() {
    let text = "<html><body>&#;</body></html>";
    let document = html::parse(text).unwrap();
    let output = document.to_string();

    // Per the failure message, serialisation escapes the ampersand, so the
    // literal text shows up as `&amp;#;` in the output.
    assert!(
        output.contains("&amp;#;"),
        "&#; with no digits should be emitted literally (& escaped). Got: {}",
        output
    );
}
/// A `<title>` in the head is expected to hold its content as one text node.
#[test]
fn rcdata_title_basic_text() {
    let markup = "<html><head><title>Page Title</title></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("title", |t| t.add_text("Page Title"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Markup such as `<b>` inside `<title>` is expected to stay literal text.
#[test]
fn rcdata_title_with_html_entities_not_parsed_as_tags() {
    let markup = "<html><head><title>Hello <b>World</b></title></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("title", |t| t.add_text("Hello <b>World</b>"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Character references inside `<title>` must be resolved: `&amp;` becomes `&`.
#[test]
fn rcdata_title_character_references_resolved() {
    // The title contains an actual reference so that resolution is exercised.
    let text = "<html><head><title>A &amp; B</title></head><body></body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| {
                head.add_element("title", |title| title.add_text("A & B"))
            })
            .add_element("body", |body| body)
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// A lone `<` inside `<title>` is expected to remain part of the title text.
#[test]
fn rcdata_title_with_less_than_not_followed_by_slash() {
    let markup = "<html><head><title>a < b</title></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("title", |t| t.add_text("a < b"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// An end tag for a different element (`</div>`) must not close `<title>`;
/// it is expected to appear verbatim in the title text.
#[test]
fn rcdata_title_wrong_end_tag_not_closed() {
    let markup = "<html><head><title>foo </div> bar</title></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("title", |t| t.add_text("foo </div> bar"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A `<textarea>` in the body is expected to hold its content as a text node.
#[test]
fn rcdata_textarea_basic() {
    let markup = "<html><body><textarea>some text</textarea></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("textarea", |area| area.add_text("some text"))
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Tags written inside `<textarea>` are expected to be kept as text, not parsed.
#[test]
fn rcdata_textarea_ignores_tags() {
    let markup = "<html><body><textarea><p>not a paragraph</p></textarea></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("textarea", |area| {
                    area.add_text("<p>not a paragraph</p>")
                })
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Hex character references inside `<textarea>` must be resolved.
#[test]
fn rcdata_textarea_hex_character_reference() {
    // 0x41 = 'A', 0x42 = 'B'; written as references so resolution is exercised.
    let text = "<html><body><textarea>&#x41;&#x42;</textarea></body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| {
                    body.add_element("textarea", |ta| ta.add_text("AB"))
                })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// CSS inside `<style>` is expected to be stored as a single text node.
#[test]
fn rawtext_style_basic() {
    let markup = "<html><head><style>body { color: red; }</style></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("style", |css| css.add_text("body { color: red; }"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A `>` child combinator inside `<style>` is expected to stay in the style text.
#[test]
fn rawtext_style_with_html_like_content() {
    let markup = "<html><head><style>div > p { color: red; }</style></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("style", |css| css.add_text("div > p { color: red; }"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Character references inside `<style>` must NOT be resolved: `&amp;` stays `&amp;`.
#[test]
fn rawtext_style_character_references_not_resolved() {
    // The style body contains a real reference; the expected text keeps it
    // verbatim, which is what "not resolved" means here.
    let text = "<html><head><style>&amp; not resolved</style></head><body></body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| {
                head.add_element("style", |style| style.add_text("&amp; not resolved"))
            })
            .add_element("body", |body| body)
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// A `<` that does not start an end tag is expected to stay inside the style text.
#[test]
fn rawtext_style_with_less_than_sign() {
    let markup = "<html><head><style>a < b { }</style></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("style", |css| css.add_text("a < b { }"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A `</div>` inside `<style>` must not end the element; it is expected as text.
#[test]
fn rawtext_style_wrong_end_tag_treated_as_text() {
    let markup = "<html><head><style>body </div> { }</style></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("style", |css| css.add_text("body </div> { }"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// An upper-case `</STYLE>` end tag is expected to close a lower-case `<style>`.
#[test]
fn rawtext_style_end_tag_case_insensitive() {
    let markup = "<html><head><style>css content</STYLE></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("style", |css| css.add_text("css content"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A truncated end tag (`</sty`) is expected to be kept as style text.
#[test]
fn rawtext_style_partial_end_tag_treated_as_text() {
    let markup = "<html><head><style></sty content</style></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("style", |css| css.add_text("</sty content"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Newlines inside `<style>` are expected to be preserved in the text node.
#[test]
fn rawtext_style_multiline_css() {
    let markup =
        "<html><head><style>\nbody {\n color: red;\n}\n</style></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("style", |css| css.add_text("\nbody {\n color: red;\n}\n"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Markup inside `<noframes>` is expected to be stored as text, not parsed.
#[test]
fn rawtext_noframes_basic() {
    let markup = "<html><head><noframes><p>not parsed</p></noframes></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("noframes", |frames| frames.add_text("<p>not parsed</p>"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Script source is expected to be stored as a single text child of `<script>`.
#[test]
fn script_data_basic_content() {
    let markup = "<html><head><script>var x = 1;</script></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("script", |js| js.add_text("var x = 1;"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// `<`, `>` and `&&` in script source are expected to stay untouched in the text.
#[test]
fn script_data_with_html_like_content() {
    let markup = "<html><head><script>if (a < b && c > d) {}</script></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("script", |js| js.add_text("if (a < b && c > d) {}"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Character references inside `<script>` must NOT be resolved: `&amp;` stays `&amp;`.
#[test]
fn script_data_character_references_not_resolved() {
    // The script body contains a real reference and the expected text keeps
    // it verbatim; a bare `&` would not show whether references are resolved.
    let text = "<html><head><script>&amp;</script></head><body></body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| {
                head.add_element("script", |script| script.add_text("&amp;"))
            })
            .add_element("body", |body| body)
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// A `</div>` inside `<script>` must not end the element; it is expected as text.
#[test]
fn script_data_wrong_end_tag_treated_as_text() {
    let markup = "<html><head><script>x </div> y</script></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("script", |js| js.add_text("x </div> y"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Two consecutive `<script>` elements are expected as two siblings in the head.
#[test]
fn script_data_multiple_scripts() {
    let markup = "<html><head><script>var a=1;</script><script>var b=2;</script></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("script", |first| first.add_text("var a=1;"))
                    .add_element("script", |second| second.add_text("var b=2;"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// An empty `<div></div>` is expected as a childless `div` element in the body.
#[test]
fn tag_open_basic_start_tag() {
    let markup = "<html><body><div></div></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_element("div", |d| d))
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Upper-case tag names are expected to produce the same tree as lower-case ones.
#[test]
fn tag_name_case_insensitivity() {
    let markup = "<HTML><BODY><DIV>text</DIV></BODY></HTML>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| d.add_text("text"))
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// `<br/>` is expected to yield a childless `br` element in the body.
#[test]
fn self_closing_void_element_br() {
    let markup = "<html><body><br/></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_element("br", |line_break| line_break))
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// `<br>` without a trailing slash is expected to yield a childless `br` element.
#[test]
fn self_closing_void_element_br_without_slash() {
    let markup = "<html><body><br></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_element("br", |line_break| line_break))
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A single-quoted attribute value is expected to be stored without its quotes.
#[test]
fn attributes_single_quoted() {
    let markup = "<html><body><div id='foo'>x</div></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| d.add_attribute_str("id", "foo").add_text("x"))
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A double-quoted attribute value is expected to be stored without its quotes.
#[test]
fn attributes_double_quoted() {
    let markup = r#"<html><body><div class="bar">x</div></body></html>"#;

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| d.add_attribute_str("class", "bar").add_text("x"))
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// An unquoted attribute value (`id=foo`) is expected to be parsed as `foo`.
#[test]
fn attributes_unquoted() {
    let markup = "<html><body><div id=foo>x</div></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| d.add_attribute_str("id", "foo").add_text("x"))
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A valueless attribute (`disabled`) is expected to be stored with an empty value.
#[test]
fn attributes_boolean_no_value() {
    let markup = "<html><body><input disabled></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("input", |field| field.add_attribute_str("disabled", ""))
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Several whitespace-separated attributes are expected to all land on the element.
#[test]
fn attributes_multiple_with_whitespace() {
    let markup = r#"<html><body><div id="a" class="b" data-x="c">x</div></body></html>"#;

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| {
                    d.add_attribute_str("id", "a")
                        .add_attribute_str("class", "b")
                        .add_attribute_str("data-x", "c")
                        .add_text("x")
                })
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// An upper-case attribute name (`ID`) is expected to be stored lower-cased (`id`).
#[test]
fn attribute_name_case_insensitivity() {
    let markup = r#"<html><body><div ID="foo">x</div></body></html>"#;

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| d.add_attribute_str("id", "foo").add_text("x"))
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// `&quot;` inside a double-quoted attribute value must resolve to `"` without
/// terminating the value.
#[test]
fn attribute_with_character_reference_in_double_quoted_value() {
    // The inner quotes have to be written as `&quot;`: a raw `"` would end the
    // attribute value right after `{` and the rest would parse as stray attributes.
    let text = r#"<html><body><div data-json="{&quot;key&quot;:&quot;val&quot;}">x</div></body></html>"#;
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| {
                    body.add_element("div", |div| {
                        div.add_attribute_str("data-json", r#"{"key":"val"}"#)
                            .add_text("x")
                    })
                })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// A comment in the body is expected as a comment node with its inner text intact.
#[test]
fn comment_basic() {
    let markup = "<html><body><!-- hello --></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_comment(" hello "))
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// `<!---->` is expected to yield a comment node with empty content.
#[test]
fn comment_empty() {
    let markup = "<html><body><!----></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_comment(""))
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A comment containing extra dashes is expected to serialise back unchanged.
#[test]
fn comment_with_dashes() {
    let rendered = html::parse("<html><body><!-- a-b--c ---></body></html>")
        .unwrap()
        .to_string();

    assert!(
        rendered.contains("<!-- a-b--c --->"),
        "comment with extra dashes should be preserved. Got: {}",
        rendered
    );
}
/// Tag-like text inside a comment is expected to stay part of the comment.
#[test]
fn comment_with_less_than() {
    let markup = "<html><body><!-- <div> --></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_comment(" <div> "))
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Two adjacent comments are expected as two separate comment nodes, in order.
#[test]
fn comment_multiple() {
    let markup = "<html><body><!-- first --><!-- second --></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_comment(" first ").add_comment(" second ")
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A comment placed before `<html>` is expected to appear in the serialised output.
#[test]
fn comment_before_html() {
    let rendered = html::parse("<!-- before --><html><body></body></html>")
        .unwrap()
        .to_string();

    assert!(
        rendered.contains("<!-- before -->"),
        "comment before html should be preserved. Got: {}",
        rendered
    );
}
/// `<!DOCTYPE html>` is expected to produce a doctype node named `html`.
#[test]
fn doctype_html5() {
    let markup = "<!DOCTYPE html><html><head></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_doctype("html")
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A lower-case `<!doctype html>` is expected to produce the same doctype node.
#[test]
fn doctype_case_insensitive() {
    let markup = "<!doctype html><html><head></head><body></body></html>";

    let want = DocumentBuilder::new()
        .add_doctype("html")
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A doctype with PUBLIC and system identifiers is expected to keep both strings.
#[test]
fn doctype_with_public_identifier() {
    let markup = r#"<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html><head></head><body></body></html>"#;

    let public_id = Some("-//W3C//DTD XHTML 1.0 Strict//EN");
    let system_id = Some("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
    let want = DocumentBuilder::new()
        .add_doctype_full("html", public_id, system_id)
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A doctype with only a SYSTEM identifier is expected to have no public identifier.
#[test]
fn doctype_with_system_identifier() {
    let markup = r#"<!DOCTYPE html SYSTEM "about:legacy-compat"><html><head></head><body></body></html>"#;

    let want = DocumentBuilder::new()
        .add_doctype_full("html", None, Some("about:legacy-compat"))
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| b)
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A bare `<div>` is expected to be wrapped in implied `html`, `head` and `body`.
#[test]
fn implicit_html_head_body() {
    let markup = "<div>hello</div>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| d.add_text("hello"))
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Bare text with no tags is expected to end up inside an implied `body`.
#[test]
fn implicit_head_body_with_only_text() {
    let markup = "just text";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_text("just text"))
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Smoke test: `<!X bogus>` must parse without error. The resulting tree is not inspected.
#[test]
fn bogus_comment_exclamation_mark() {
    let markup = "<html><body><!X bogus></body></html>";
    let _ = html::parse(markup).unwrap();
}
/// Text before and after an inline element is expected as separate sibling nodes.
#[test]
fn mixed_text_and_elements() {
    let markup = "<html><body>before<span>middle</span>after</body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_text("before")
                    .add_element("span", |inline| inline.add_text("middle"))
                    .add_text("after")
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Three levels of nesting (`div > span > a`) are expected to be preserved.
#[test]
fn nested_elements() {
    let markup = "<html><body><div><span><a>deep</a></span></div></body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |outer| {
                    outer.add_element("span", |middle| {
                        middle.add_element("a", |inner| inner.add_text("deep"))
                    })
                })
            })
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Empty input is expected to serialise as the bare `html`/`head`/`body` skeleton.
#[test]
fn empty_document() {
    let rendered = html::parse("").unwrap().to_string();

    assert_eq!(rendered, "<html><head></head><body></body></html>");
}
/// Smoke test: whitespace-only input must parse without error. The tree is not inspected.
#[test]
fn whitespace_only_document() {
    let markup = " \n\t ";
    let _ = html::parse(markup).unwrap();
}
/// Three back-to-back `&amp;` references must each resolve to `&`.
#[test]
fn multiple_character_references_in_sequence() {
    // Write the references out; bare `&&&` contains no character reference.
    let text = "<html><body>&amp;&amp;&amp;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("&&&"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// A reference that is the last thing before an end tag must still resolve.
#[test]
fn character_reference_at_end_of_text() {
    // `&amp;` sits directly before `</body>`.
    let text = "<html><body>hello&amp;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("hello&"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// Alternating decimal and hex references must all resolve within one text run.
#[test]
fn decimal_and_hex_character_references_mixed() {
    // 65 = 'A', 0x42 = 'B', 67 = 'C', 0x44 = 'D'.
    let text = "<html><body>&#65;&#x42;&#67;&#x44;</body></html>";
    let document = html::parse(text).unwrap();

    let expected = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |head| head)
                .add_element("body", |body| body.add_text("ABCD"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(expected, document, true));
}
/// `&xyz` is not a known reference, so the ampersand and name must be kept as text.
#[test]
fn ambiguous_ampersand_not_a_reference() {
    let text = "<html><body>&xyz hello</body></html>";
    let document = html::parse(text).unwrap();
    let output = document.to_string();

    // NOTE(review): this assumes `to_string()` escapes `&` in text as `&amp;`,
    // which is what the no-digits numeric-reference tests' messages describe;
    // confirm against the serializer.
    assert!(
        output.contains("&amp;xyz hello"),
        "ambiguous ampersand should be preserved. Got: {}",
        output
    );
}
/// A `&` followed by a space is expected to stay a literal ampersand in the text.
#[test]
fn ampersand_followed_by_space() {
    let markup = "<html><body>a & b</body></html>";

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_text("a & b"))
        })
        .build()
        .unwrap();
    let parsed = html::parse(markup).unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// An attribute written on an end tag (`</div id="x">`) must not show up in
/// the tree: the expected `div` carries only its text child.
#[test]
fn end_tag_with_attributes_ignored() {
    let markup = r#"<html><body><div>text</div id="x"></body></html>"#;
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| d.add_text("text"))
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// End-to-end check on a small complete page (doctype, title, style, script,
/// nested inline elements, a comment and a link with a query string).
/// The page is parsed, serialized back to a string, and probed for six
/// substrings.
#[test]
fn complex_page_structure() {
    // NOTE(review): several failure messages below talk about character
    // references being resolved and `<`/`>` being escaped on serialization,
    // but this fixture and the probes contain only literal characters.
    // Confirm the fixture was not entity-decoded by accident.
    let page = r#"<!DOCTYPE html>
<html>
<head>
<title>Test & Page</title>
<style>body { color: red; }</style>
<script>var x = '<div>';</script>
</head>
<body>
<div id="main" class="container">
<h1>Hello & World</h1>
<p>Paragraph with <em>emphasis</em> and <strong>strong</strong>.</p>
<!-- navigation -->
<a href="?a=1&b=2">Link</a>
</div>
</body>
</html>"#;
    let rendered = html::parse(page).unwrap().to_string();

    assert!(
        rendered.contains("<title>Test & Page</title>"),
        "title should have resolved character reference. Got: {}",
        rendered
    );
    assert!(
        rendered.contains("body { color: red; }"),
        "style RAWTEXT content should be preserved. Got: {}",
        rendered
    );
    assert!(
        rendered.contains("var x = '<div>';"),
        "script content should be preserved (with < > escaped in serialization). Got: {}",
        rendered
    );
    assert!(
        rendered.contains("Hello & World"),
        "hex char ref & should resolve to &. Got: {}",
        rendered
    );
    assert!(
        rendered.contains("<!-- navigation -->"),
        "comment should be preserved. Got: {}",
        rendered
    );
    assert!(
        rendered.contains("?a=1&b=2"),
        "attribute char ref should be resolved. Got: {}",
        rendered
    );
}
/// After a `style` element in the head closes, parsing must continue
/// normally: the body is expected to hold a `p` with its text.
#[test]
fn style_followed_by_content() {
    let markup = "<html><head><style>.x{}</style></head><body><p>text</p></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("style", |css| css.add_text(".x{}"))
            })
            .add_element("body", |b| {
                b.add_element("p", |para| para.add_text("text"))
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// After a `title` element in the head closes, the body content must still be
/// parsed as ordinary elements (`div` with text).
#[test]
fn title_followed_by_content() {
    let markup = "<html><head><title>t</title></head><body><div>content</div></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("title", |t| t.add_text("t"))
            })
            .add_element("body", |b| {
                b.add_element("div", |d| d.add_text("content"))
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// After a `script` element in the head closes, the body content must still
/// be parsed as ordinary elements (`p` with text).
#[test]
fn script_followed_by_content() {
    let markup = "<html><head><script>x</script></head><body><p>after</p></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("script", |js| js.add_text("x"))
            })
            .add_element("body", |b| {
                b.add_element("p", |para| para.add_text("after"))
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A hexadecimal reference to a surrogate code point (`&#xD800;`) must be
/// replaced by U+FFFD REPLACEMENT CHARACTER.
#[test]
fn hex_character_reference_surrogate_replaced_with_fffd() {
    // The input must contain the surrogate reference itself. Feeding a literal
    // U+FFFD would pass even if surrogate handling were missing entirely.
    let markup = "<html><body>&#xD800;</body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_text("\u{FFFD}"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A hexadecimal reference above U+10FFFF (`&#x110000;`) must be replaced by
/// U+FFFD REPLACEMENT CHARACTER.
#[test]
fn hex_character_reference_outside_unicode_replaced_with_fffd() {
    // 0x110000 is the first value past the Unicode range. A literal U+FFFD in
    // the input would not exercise the out-of-range check at all.
    let markup = "<html><body>&#x110000;</body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_text("\u{FFFD}"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A decimal reference with no terminating semicolon (`&#65` followed by a
/// space) must still produce the character `A`.
#[test]
fn decimal_character_reference_missing_semicolon() {
    // The markup contains no other upper-case `A`, so finding one in the
    // output proves the reference was decoded. With a literal `A` as input
    // the assertion below could never fail.
    let markup = "<html><body>&#65 end</body></html>";
    let rendered = html::parse(markup).unwrap().to_string();

    assert!(
        rendered.contains('A'),
        "decimal char ref without semicolon should still produce character. Got: {}",
        rendered
    );
}
/// Numeric references in the C1 control range must be remapped through the
/// Windows-1252 replacement table rather than emitted as control characters.
#[test]
fn numeric_char_ref_windows_1252_replacements() {
    // 0x80 -> U+20AC, 0x93 -> U+201C, 0x94 -> U+201D, 0x96 -> U+2013,
    // 0x97 -> U+2014. The references are written out so the remapping is
    // actually exercised; literal characters would bypass it.
    let markup = "<html><body>&#x80;&#x93;&#x94;&#x96;&#x97;</body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_text("\u{20AC}\u{201C}\u{201D}\u{2013}\u{2014}")
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// `</style >` (whitespace before `>`) must still close the `style` element,
/// leaving `css` as its only text and an empty body.
#[test]
fn rawtext_style_end_tag_with_space_before_close() {
    let markup = "<html><head><style>css</style ></head><body></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("style", |css| css.add_text("css"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Inside `style`, `</` followed by a digit is not an end tag: the whole
/// `</1 content` run is expected to stay as the element's text.
#[test]
fn rawtext_style_slash_not_followed_by_tag_name() {
    let markup = "<html><head><style></1 content</style></head><body></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("style", |css| css.add_text("</1 content"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// An empty `<style></style>` yields a `style` element with no children.
#[test]
fn rawtext_style_empty() {
    let markup = "<html><head><style></style></head><body></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h.add_element("style", |css| css))
                .add_element("body", |b| b)
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// An upper-case `</TITLE>` must close a lower-case `<title>`.
#[test]
fn rcdata_title_end_tag_case_insensitive() {
    let markup = "<html><head><title>text</TITLE></head><body></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("title", |t| t.add_text("text"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// `</title >` (whitespace before `>`) must still close the `title` element.
#[test]
fn rcdata_title_end_tag_with_space() {
    let markup = "<html><head><title>text</title ></head><body></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("title", |t| t.add_text("text"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Inside `title`, `</titl` does not match the open element's name, so
/// `</titl content` is expected to remain as the title's text.
#[test]
fn rcdata_title_partial_end_tag_treated_as_text() {
    let markup = "<html><head><title></titl content</title></head><body></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("title", |t| t.add_text("</titl content"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// An empty `<title></title>` yields a `title` element with no children.
#[test]
fn rcdata_title_empty() {
    let markup = "<html><head><title></title></head><body></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h.add_element("title", |t| t))
                .add_element("body", |b| b)
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// An HTML-comment wrapper inside `script` is part of the script text: the
/// expected text node keeps `<!--`, the code and `-->` verbatim.
#[test]
fn script_data_with_html_comment() {
    let markup =
        "<html><head><script><!--\nvar x = 1;\n--></script></head><body></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| {
                h.add_element("script", |js| js.add_text("<!--\nvar x = 1;\n-->"))
            })
            .add_element("body", |b| b)
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// The script must end at `</script>`, and the text after it must still be
/// present in the serialized output.
#[test]
fn script_data_escaped_with_close_tag() {
    let markup = "<html><head><script>var x=1;</script></head><body>after</body></html>";
    let rendered = html::parse(markup).unwrap().to_string();

    assert!(
        rendered.contains("<script>var x=1;</script>"),
        "script should close at </script>. Got: {}",
        rendered
    );
    assert!(
        rendered.contains("after"),
        "content after script should be parsed. Got: {}",
        rendered
    );
}
/// Body text made of tab, line feed and form feed: the serialized output must
/// contain at least a tab or a newline.
#[test]
fn tab_line_feed_form_feed_in_text() {
    let markup = "<html><body>\t\n\x0C</body></html>";
    let rendered = html::parse(markup).unwrap().to_string();

    // Either character is sufficient; the form feed is not checked.
    let has_tab = rendered.contains("\t");
    let has_newline = rendered.contains("\n");
    assert!(
        has_tab || has_newline,
        "whitespace characters should be preserved. Got: {:?}",
        rendered
    );
}
/// The named reference `&hearts;` must decode to U+2665 BLACK HEART SUIT.
#[test]
fn named_character_reference_hearts() {
    // Spell the reference out; a literal heart in the input would pass
    // without the named-reference lookup ever being consulted.
    let markup = "<html><body>&hearts;</body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h)
                .add_element("body", |b| b.add_text("\u{2665}"))
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// `&nLt;` is a named reference that expands to two code points
/// (U+226A followed by U+20D2); the output must contain at least the first.
#[test]
fn named_character_reference_two_codepoint() {
    // Use the reference itself rather than its already-expanded characters,
    // otherwise the multi-code-point expansion is never exercised.
    let markup = "<html><body>&nLt;</body></html>";
    let rendered = html::parse(markup).unwrap().to_string();

    assert!(
        rendered.contains('\u{226A}'),
        "two-codepoint named ref should produce first codepoint. Got: {:?}",
        rendered
    );
}
/// `id=""` (double quotes) must produce an `id` attribute whose value is the
/// empty string.
#[test]
fn attribute_empty_value_double_quoted() {
    let markup = r#"<html><body><div id="">x</div></body></html>"#;
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| d.add_attribute_str("id", "").add_text("x"))
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// `id=''` (single quotes) must produce an `id` attribute whose value is the
/// empty string.
#[test]
fn attribute_empty_value_single_quoted() {
    let markup = "<html><body><div id=''>x</div></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| d.add_attribute_str("id", "").add_text("x"))
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A hexadecimal reference inside a single-quoted attribute value
/// (`data-x='&#x41;'`) must decode to `A`.
#[test]
fn attribute_with_hex_char_ref_in_single_quoted() {
    // The value is written as a reference; a literal `A` would not touch the
    // attribute-value character-reference path.
    let markup = "<html><body><div data-x='&#x41;'>x</div></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| {
                    d.add_attribute_str("data-x", "A").add_text("x")
                })
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// A decimal reference inside an unquoted attribute value (`data-x=&#65;`)
/// must decode to `A`.
#[test]
fn attribute_with_decimal_char_ref_in_unquoted() {
    // The value is written as a reference; a literal `A` would not touch the
    // attribute-value character-reference path.
    let markup = "<html><body><div data-x=&#65;>x</div></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| {
                    d.add_attribute_str("data-x", "A").add_text("x")
                })
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// When `class` appears twice on one tag, only the first value (`a`) is
/// expected on the resulting element.
#[test]
fn duplicate_attributes_first_wins() {
    let markup =
        r#"<html><head></head><body><div class="a" class="b">text</div></body></html>"#;
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| {
                    d.add_attribute_str("class", "a").add_text("text")
                })
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// With three `id` attributes on one tag, only the first value (`first`) is
/// expected on the resulting element.
#[test]
fn duplicate_attributes_multiple_different_values() {
    let markup = r#"<html><head></head><body><p id="first" id="second" id="third">text</p></body></html>"#;
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("p", |para| {
                    para.add_attribute_str("id", "first").add_text("text")
                })
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Three attributes with distinct names must all be kept with their values.
#[test]
fn non_duplicate_attributes_all_preserved() {
    let markup = r#"<html><head></head><body><div id="a" class="b" data-x="c">text</div></body></html>"#;
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| {
                    d.add_attribute_str("id", "a")
                        .add_attribute_str("class", "b")
                        .add_attribute_str("data-x", "c")
                        .add_text("text")
                })
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// The first occurrence wins even when its value is empty: `class=""`
/// followed by `class="b"` is expected to leave `class` empty.
#[test]
fn duplicate_attribute_first_empty_value() {
    let markup =
        r#"<html><head></head><body><div class="" class="b">text</div></body></html>"#;
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| {
                    d.add_attribute_str("class", "").add_text("text")
                })
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// `CLASS` and `class` count as the same attribute: the expected element has
/// a single lower-case `class` carrying the first value (`a`).
#[test]
fn duplicate_attribute_case_insensitive() {
    let markup =
        r#"<html><head></head><body><div CLASS="a" class="b">text</div></body></html>"#;
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("div", |d| {
                    d.add_attribute_str("class", "a").add_text("text")
                })
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// In body text, the named reference `&AElig` written without a semicolon
/// must still decode to U+00C6.
#[test]
fn named_char_ref_without_semicolon_in_body() {
    // The reference is deliberately left unterminated. A literal U+00C6 in
    // the input would not test the missing-semicolon handling at all.
    let markup = "<html><body><p>&AElig</p></body></html>";
    let parsed = html::parse(markup).unwrap();

    let want = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("p", |para| para.add_text("\u{00C6}"))
            })
        })
        .build()
        .unwrap();

    assert!(test_framework::compare_documents(want, parsed, true));
}
/// Inside an attribute value, `&AElig` without a semicolon and immediately
/// followed by a letter must NOT be decoded; the text stays literal.
#[test]
fn named_char_ref_without_semicolon_in_attribute_followed_by_alpha() {
    let text = r#"<html><body><a href="?a=&AEligx">link</a></body></html>"#;
    let document = html::parse(text).unwrap();
    let output = document.to_string();

    // Accept the literal ampersand either raw or escaped as `&amp;` by the
    // serializer. A decoded U+00C6 matches neither alternative, so a parser
    // that wrongly resolves the reference fails here.
    assert!(
        output.contains("&AEligx") || output.contains("&amp;AEligx"),
        "Named char ref without semicolon followed by alpha in attribute should be literal: {output:?}"
    );
}
/// Inside an attribute value, `&AElig` without a semicolon and immediately
/// followed by `=` must NOT be decoded; the text stays literal.
#[test]
fn named_char_ref_without_semicolon_in_attribute_followed_by_equals() {
    let text = r#"<html><body><a href="?&AElig=1">link</a></body></html>"#;
    let document = html::parse(text).unwrap();
    let output = document.to_string();

    // Accept the literal ampersand either raw or escaped as `&amp;` by the
    // serializer. A decoded U+00C6 matches neither alternative, so a parser
    // that wrongly resolves the reference fails here.
    assert!(
        output.contains("&AElig=") || output.contains("&amp;AElig="),
        "Named char ref without semicolon followed by = in attribute should be literal: {output:?}"
    );
}
/// A semicolon-terminated `&AElig;` must decode to U+00C6 both in body text
/// and inside an attribute value, even when a letter follows it.
#[test]
fn named_char_ref_with_semicolon_always_resolves() {
    // Body text case. The reference is spelled out in the input so that
    // decoding is what is being tested.
    let text_body = "<html><body><p>&AElig;</p></body></html>";
    let doc_body = html::parse(text_body).unwrap();
    let expected_body = DocumentBuilder::new()
        .add_element("html", |root| {
            root.add_element("head", |h| h).add_element("body", |b| {
                b.add_element("p", |para| para.add_text("\u{00C6}"))
            })
        })
        .build()
        .unwrap();
    assert!(test_framework::compare_documents(
        expected_body,
        doc_body,
        true
    ));

    // Attribute case: the trailing `x` must not stop the terminated
    // reference from resolving.
    let text_attr = r#"<html><body><a href="?a=&AElig;x">link</a></body></html>"#;
    let doc_attr = html::parse(text_attr).unwrap();
    let output_attr = doc_attr.to_string();
    assert!(
        output_attr.contains("\u{00C6}x"),
        "Named char ref with semicolon should always resolve: {output_attr:?}"
    );
}
/// `<` followed by a digit is not a tag. The text after it must survive, and
/// the rest of the document (including `</body>`) must still be serialized.
#[test]
fn tag_open_state_unexpected_char_does_not_emit_eof() {
    let markup = "<html><body><3 hearts</body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let kept_text = output.contains("3 hearts");
    assert!(
        kept_text,
        "Unexpected char after '<' should reconsume in data state, preserving text: {output:?}"
    );

    let reached_end = output.contains("</body>");
    assert!(
        reached_end,
        "Document should parse to completion: {output:?}"
    );
}
/// An XML declaration (`<?xml ...?>`) inside the body must not be emitted as
/// `<?xml` in the output, and the body element must still be present.
#[test]
fn tag_open_state_question_mark_creates_bogus_comment() {
    let markup = "<html><body><?xml version='1.0'?></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let has_body = output.contains("<body>");
    assert!(
        has_body,
        "Document should still have a body after bogus comment: {output:?}"
    );

    let leaked_declaration = output.contains("<?xml");
    assert!(
        !leaked_declaration,
        "Bogus comment content should not appear as text: {output:?}"
    );
}