use skyscraper::html;
use skyscraper::xpath::grammar::XpathItemTreeNode;
/// A single U+0000 character inside body text must be absent from the
/// serialized document, and the text on both sides of it must end up adjacent.
#[test]
fn null_character_in_body_is_ignored() {
    let output = html::parse("<html><body>hello\0world</body></html>")
        .unwrap()
        .to_string();

    let nul_survived = output.contains('\0');
    assert!(
        !nul_survived,
        "NULL character should be stripped from output: {output:?}"
    );

    let text_joined = output.contains("helloworld");
    assert!(
        text_joined,
        "Text on either side of the NULL should be preserved: {output:?}"
    );
}
/// Several U+0000 characters interleaved with letters: none may appear in the
/// serialized document, and the letters must remain as the contiguous `abc`.
#[test]
fn multiple_null_characters_in_body_are_ignored() {
    let markup = "<html><body>\0a\0b\0c\0</body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let nul_survived = output.contains('\0');
    let letters_kept = output.contains("abc");

    assert!(
        !nul_survived,
        "No NULL characters should survive: {output:?}"
    );
    assert!(
        letters_kept,
        "Non-NULL characters should be preserved: {output:?}"
    );
}
/// Same NUL handling as the body-level tests, but with the character placed
/// inside a `<p>` element: the output must contain `before` and no NUL.
#[test]
fn null_character_in_nested_element_is_ignored() {
    let parsed = html::parse("<html><body><p>be\0fore</p></body></html>").unwrap();
    let output = parsed.to_string();

    let nul_survived = output.contains('\0');
    assert!(
        !nul_survived,
        "NULL should be stripped inside nested elements: {output:?}"
    );

    let word_intact = output.contains("before");
    assert!(
        word_intact,
        "Surrounding text should be intact: {output:?}"
    );
}
/// A `<frameset>` start tag that follows body text must not produce a
/// `<frameset` tag in the serialized document, while the preceding text stays.
#[test]
fn frameset_in_body_ignored_when_frameset_not_ok() {
    let markup = "<html><body>text<frameset></frameset></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let has_frameset = output.contains("<frameset");
    assert!(
        !has_frameset,
        "frameset should be ignored when frameset-ok is false: {output:?}"
    );

    let has_text = output.contains("text");
    assert!(has_text, "Body content should be preserved: {output:?}");
}
/// A `<frameset>` start tag that appears after only a `<div>` (no text) must
/// show up as `<frameset>` in the serialized document.
///
/// NOTE(review): only the presence of `<frameset>` is asserted; the absence of
/// the body element is not checked here.
#[test]
fn frameset_in_body_replaces_body_when_frameset_ok() {
    let output = html::parse("<html><head></head><div><frameset></frameset></html>")
        .unwrap()
        .to_string();

    let has_frameset = output.contains("<frameset>");
    assert!(has_frameset, "frameset should be present: {output:?}");
}
/// `<pre>` opened while a `<p>` is still open: the output must contain
/// `<pre>` and must not contain the nested sequence `<p><pre>`.
#[test]
fn pre_start_tag_closes_p_and_inserts() {
    let markup = "<html><body><p>para<pre>code</pre></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let pre_present = output.contains("<pre>");
    let pre_directly_in_p = output.contains("<p><pre>");

    assert!(pre_present, "pre element should be present: {output:?}");
    assert!(!pre_directly_in_p, "p should not contain pre: {output:?}");
}
/// `<listing>` opened while a `<p>` is still open: the output must contain
/// `<listing>` and must not contain the nested sequence `<p><listing>`.
#[test]
fn listing_start_tag_closes_p_and_inserts() {
    let markup = "<html><body><p>para<listing>code</listing></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let listing_present = output.contains("<listing>");
    let listing_directly_in_p = output.contains("<p><listing>");

    assert!(
        listing_present,
        "listing element should be present: {output:?}"
    );
    assert!(
        !listing_directly_in_p,
        "p should not contain listing: {output:?}"
    );
}
/// A line feed immediately after the `<pre>` start tag must be dropped, so the
/// element serializes as exactly `<pre>hello</pre>`.
#[test]
fn pre_strips_leading_newline() {
    let output = html::parse("<html><body><pre>\nhello</pre></body></html>")
        .unwrap()
        .to_string();

    let newline_dropped = output.contains("<pre>hello</pre>");
    assert!(
        newline_dropped,
        "Leading newline should be stripped: {output:?}"
    );
}
/// `<pre>` content that starts with a non-LF character must be serialized
/// unchanged as `<pre>hello</pre>`.
///
/// NOTE(review): the input has no leading whitespace at all, so this only
/// covers content whose first character is a letter.
#[test]
fn pre_does_not_strip_non_lf() {
    let output = html::parse("<html><body><pre>hello</pre></body></html>")
        .unwrap()
        .to_string();

    let content_kept = output.contains("<pre>hello</pre>");
    assert!(
        content_kept,
        "Non-LF content should be preserved: {output:?}"
    );
}
/// Two consecutive `<dd>` start tags without end tags must serialize as two
/// separate, closed `<dd>` elements.
#[test]
fn dd_start_tag_closes_previous_dd() {
    let markup = "<html><body><dl><dd>first<dd>second</dl></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let first_closed = output.contains("<dd>first</dd>");
    assert!(first_closed, "first dd should be closed: {output:?}");

    let second_closed = output.contains("<dd>second</dd>");
    assert!(second_closed, "second dd should be present: {output:?}");
}
/// A `<dt>` start tag following an unclosed `<dd>` must leave both elements
/// serialized as closed siblings.
#[test]
fn dt_start_tag_closes_previous_dd() {
    let markup = "<html><body><dl><dd>desc<dt>term</dl></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let dd_closed = output.contains("<dd>desc</dd>");
    assert!(dd_closed, "dd should be closed by dt: {output:?}");

    let dt_closed = output.contains("<dt>term</dt>");
    assert!(dt_closed, "dt should be present: {output:?}");
}
/// A `<dd>` start tag following an unclosed `<dt>` must leave both elements
/// serialized as closed siblings.
#[test]
fn dd_start_tag_closes_previous_dt() {
    let markup = "<html><body><dl><dt>term<dd>desc</dl></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let dt_closed = output.contains("<dt>term</dt>");
    assert!(dt_closed, "dt should be closed by dd: {output:?}");

    let dd_closed = output.contains("<dd>desc</dd>");
    assert!(dd_closed, "dd should be present: {output:?}");
}
/// A `<dd>` start tag encountered while a `<p>` is open must not serialize as
/// the nested sequence `<p><dd>`.
#[test]
fn dd_start_tag_closes_p_element() {
    let output = html::parse("<html><body><p>text<dd>desc</body></html>")
        .unwrap()
        .to_string();

    let dd_directly_in_p = output.contains("<p><dd>");
    assert!(!dd_directly_in_p, "p should not contain dd: {output:?}");
}
/// An explicitly closed `<dd>` must round-trip as `<dd>content</dd>`.
#[test]
fn dd_end_tag_closes_dd() {
    let output = html::parse("<html><body><dl><dd>content</dd></dl></body></html>")
        .unwrap()
        .to_string();

    let dd_closed = output.contains("<dd>content</dd>");
    assert!(dd_closed, "dd should be properly closed: {output:?}");
}
/// A stray `</dd>` with no matching open element must not disturb the
/// paragraph that follows it.
#[test]
fn dd_end_tag_without_scope_is_ignored() {
    let output = html::parse("<html><body></dd><p>text</p></body></html>")
        .unwrap()
        .to_string();

    let paragraph_kept = output.contains("<p>text</p>");
    assert!(
        paragraph_kept,
        "Body content should be preserved: {output:?}"
    );
}
/// An explicitly closed `<dt>` must round-trip as `<dt>term</dt>`.
#[test]
fn dt_end_tag_closes_dt() {
    let output = html::parse("<html><body><dl><dt>term</dt></dl></body></html>")
        .unwrap()
        .to_string();

    let dt_closed = output.contains("<dt>term</dt>");
    assert!(dt_closed, "dt should be properly closed: {output:?}");
}
/// An `<applet>` element with text content must round-trip unchanged.
#[test]
fn applet_start_tag_inserts_element() {
    let output = html::parse("<html><body><applet>content</applet></body></html>")
        .unwrap()
        .to_string();

    let applet_kept = output.contains("<applet>content</applet>");
    assert!(applet_kept, "applet should be present: {output:?}");
}
/// A `<marquee>` element with text content must round-trip unchanged.
#[test]
fn marquee_start_tag_inserts_element() {
    let output = html::parse("<html><body><marquee>scrolling</marquee></body></html>")
        .unwrap()
        .to_string();

    let marquee_kept = output.contains("<marquee>scrolling</marquee>");
    assert!(marquee_kept, "marquee should be present: {output:?}");
}
/// A stray `</object>` with no matching open element must not disturb the
/// paragraph that follows it.
#[test]
fn object_end_tag_without_scope_is_ignored() {
    let output = html::parse("<html><body></object><p>text</p></body></html>")
        .unwrap()
        .to_string();

    let paragraph_kept = output.contains("<p>text</p>");
    assert!(
        paragraph_kept,
        "Body content should be preserved: {output:?}"
    );
}
/// A `<nobr>` element with text content must round-trip unchanged.
#[test]
fn nobr_start_tag_inserts_element() {
    let output = html::parse("<html><body><nobr>no break</nobr></body></html>")
        .unwrap()
        .to_string();

    let nobr_kept = output.contains("<nobr>no break</nobr>");
    assert!(nobr_kept, "nobr element should be present: {output:?}");
}
/// A `<nobr>` opened inside an unclosed `<nobr>`: both text runs must still be
/// present in the serialized document.
///
/// Only text preservation is asserted; the resulting element structure is not
/// inspected.
#[test]
fn nested_nobr_triggers_adoption_agency() {
    let markup = "<html><body><nobr>first<nobr>second</nobr></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let first_kept = output.contains("first");
    let second_kept = output.contains("second");

    assert!(first_kept, "first text should be preserved: {output:?}");
    assert!(second_kept, "second text should be preserved: {output:?}");
}
/// A stray `</sarcasm>` end tag inside a paragraph must leave the paragraph
/// serialized as `<p>text</p>`.
#[test]
fn sarcasm_end_tag_uses_any_other_end_tag() {
    let output = html::parse("<html><body><p>text</sarcasm></p></body></html>")
        .unwrap()
        .to_string();

    let paragraph_kept = output.contains("<p>text</p>");
    assert!(paragraph_kept, "Content should be preserved: {output:?}");
}
/// Parses an unterminated `<plaintext>` element followed by markup-looking
/// content and checks that the element and all of the following text survive.
///
/// The text check looks for the plain words on both sides of the `<b>` tag, so
/// it holds whether or not the serializer escapes angle brackets. The previous
/// three-way `||` was satisfied by the word `raw` alone, and its second
/// alternative was already implied by the first.
///
/// NOTE(review): a string check on the output may not be able to tell raw text
/// from a parsed `<b>` element; inspect the tree if that distinction matters.
#[test]
fn plaintext_start_tag_inserts_and_switches_tokenizer() {
    let output = html::parse("<html><body><plaintext>raw <b>not bold</b> text")
        .unwrap()
        .to_string();

    assert!(
        output.contains("<plaintext>"),
        "plaintext element should be present: {output:?}"
    );
    assert!(
        output.contains("raw") && output.contains("not bold"),
        "Content after <plaintext> should be preserved as text: {output:?}"
    );
}
/// A `<plaintext>` start tag encountered while a `<p>` is open must not
/// serialize as the nested sequence `<p><plaintext>`.
#[test]
fn plaintext_start_tag_closes_p() {
    let output = html::parse("<html><body><p>para<plaintext>raw text")
        .unwrap()
        .to_string();

    let plaintext_directly_in_p = output.contains("<p><plaintext>");
    assert!(
        !plaintext_directly_in_p,
        "p should not contain plaintext: {output:?}"
    );
}
/// A `<param>` with attributes inside `<object>` must appear in the output as
/// a tag beginning with `<param `.
#[test]
fn param_start_tag_inserts_void_element() {
    let markup =
        "<html><body><object><param name=\"movie\" value=\"test.swf\"></object></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let param_present = output.contains("<param ");
    assert!(param_present, "param element should be present: {output:?}");
}
/// A `<source>` with attributes inside `<video>` must appear in the output as
/// a tag beginning with `<source `.
#[test]
fn source_start_tag_inserts_void_element() {
    let markup = "<html><body><video><source src=\"video.mp4\"></video></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let source_present = output.contains("<source ");
    assert!(
        source_present,
        "source element should be present: {output:?}"
    );
}
/// A `<track>` with attributes inside `<video>` must appear in the output as a
/// tag beginning with `<track `.
#[test]
fn track_start_tag_inserts_void_element() {
    let markup =
        "<html><body><video><track kind=\"subtitles\" src=\"subs.vtt\"></video></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let track_present = output.contains("<track ");
    assert!(track_present, "track element should be present: {output:?}");
}
/// `<hr>` encountered while a `<p>` is open: the output must contain `<hr>`
/// and must not contain the nested sequence `<p><hr>`.
#[test]
fn hr_start_tag_closes_p_and_inserts() {
    let output = html::parse("<html><body><p>text<hr></body></html>")
        .unwrap()
        .to_string();

    let hr_present = output.contains("<hr>");
    let hr_directly_in_p = output.contains("<p><hr>");

    assert!(hr_present, "hr element should be present: {output:?}");
    assert!(!hr_directly_in_p, "p should not contain hr: {output:?}");
}
/// `<hr>` inside a `<div>` (no open paragraph) must appear in the output.
#[test]
fn hr_start_tag_inserts_without_p() {
    let output = html::parse("<html><body><div><hr></div></body></html>")
        .unwrap()
        .to_string();

    let hr_present = output.contains("<hr>");
    assert!(hr_present, "hr element should be present: {output:?}");
}
/// An `<image>` start tag must be serialized as `<img ...>`: no `<image`
/// remains in the output and the `src` value is carried over.
#[test]
fn image_start_tag_rewritten_to_img() {
    let markup = "<html><body><image src=\"photo.jpg\"></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let img_present = output.contains("<img ");
    let image_present = output.contains("<image");
    let src_kept = output.contains("photo.jpg");

    assert!(img_present, "image should be rewritten to img: {output:?}");
    assert!(
        !image_present,
        "image tag should not appear in output: {output:?}"
    );
    assert!(
        src_kept,
        "attributes should be preserved through rewrite: {output:?}"
    );
}
/// `<xmp>` containing markup-looking content: the output must contain `<xmp>`
/// and the literal `<b>`.
///
/// NOTE(review): a string check on the output may not be able to tell raw text
/// from a parsed `<b>` element; confirm whether a tree-level check is wanted.
#[test]
fn xmp_start_tag_inserts_raw_text() {
    let markup = "<html><body><xmp><b>not bold</b></xmp></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let xmp_present = output.contains("<xmp>");
    assert!(xmp_present, "xmp element should be present: {output:?}");

    let literal_b_present = output.contains("<b>");
    assert!(
        literal_b_present,
        "Content inside xmp should be escaped as raw text: {output:?}"
    );
}
/// An `<xmp>` start tag encountered while a `<p>` is open must not serialize
/// as the nested sequence `<p><xmp>`.
#[test]
fn xmp_start_tag_closes_p() {
    let output = html::parse("<html><body><p>text<xmp>code</xmp></body></html>")
        .unwrap()
        .to_string();

    let xmp_directly_in_p = output.contains("<p><xmp>");
    assert!(!xmp_directly_in_p, "p should not contain xmp: {output:?}");
}
/// An `<iframe>` containing markup-looking content must appear in the output.
///
/// Only the presence of `<iframe>` is asserted; its content is not inspected.
#[test]
fn iframe_start_tag_inserts_raw_text() {
    let markup = "<html><body><iframe><b>not bold</b></iframe></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let iframe_present = output.contains("<iframe>");
    assert!(
        iframe_present,
        "iframe element should be present: {output:?}"
    );
}
/// A `<noembed>` containing markup-looking content must appear in the output.
///
/// Only the presence of `<noembed>` is asserted; its content is not inspected.
#[test]
fn noembed_start_tag_inserts_raw_text() {
    let markup = "<html><body><noembed><b>not bold</b></noembed></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let noembed_present = output.contains("<noembed>");
    assert!(
        noembed_present,
        "noembed element should be present: {output:?}"
    );
}
/// A `<noscript>` element with text content must round-trip unchanged.
#[test]
fn noscript_start_tag_inserts_normally_when_scripting_disabled() {
    let markup = "<html><body><noscript>fallback content</noscript></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let noscript_kept = output.contains("<noscript>fallback content</noscript>");
    assert!(
        noscript_kept,
        "noscript element should be present with content: {output:?}"
    );
}
/// An `<rb>` element inside `<ruby>` must round-trip as `<rb>base</rb>`.
#[test]
fn rb_start_tag_inside_ruby() {
    let output = html::parse("<html><body><ruby>text<rb>base</rb></ruby></body></html>")
        .unwrap()
        .to_string();

    let rb_kept = output.contains("<rb>base</rb>");
    assert!(rb_kept, "rb element should be present: {output:?}");
}
/// An `<rtc>` element inside `<ruby>` must round-trip as
/// `<rtc>annotation</rtc>`.
#[test]
fn rtc_start_tag_inside_ruby() {
    let markup = "<html><body><ruby>text<rtc>annotation</rtc></ruby></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let rtc_kept = output.contains("<rtc>annotation</rtc>");
    assert!(rtc_kept, "rtc element should be present: {output:?}");
}
/// An `<rt>` element inside `<ruby>` must round-trip as `<rt>annotation</rt>`.
#[test]
fn rt_start_tag_inside_ruby() {
    let markup = "<html><body><ruby>text<rt>annotation</rt></ruby></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let rt_kept = output.contains("<rt>annotation</rt>");
    assert!(rt_kept, "rt element should be present: {output:?}");
}
/// Two `<rp>` elements surrounding an `<rt>` inside `<ruby>` must both
/// round-trip with their parenthesis content.
#[test]
fn rp_start_tag_inside_ruby() {
    let markup =
        "<html><body><ruby>text<rp>(</rp><rt>annotation</rt><rp>)</rp></ruby></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let opening_rp_kept = output.contains("<rp>(</rp>");
    let closing_rp_kept = output.contains("<rp>)</rp>");

    assert!(
        opening_rp_kept,
        "first rp element should be present: {output:?}"
    );
    assert!(
        closing_rp_kept,
        "second rp element should be present: {output:?}"
    );
}
/// An `<rt>` start tag following an unclosed `<rb>` must leave both elements
/// serialized as closed siblings.
#[test]
fn rt_closes_open_rb() {
    let markup = "<html><body><ruby><rb>base<rt>annotation</ruby></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let rb_closed = output.contains("<rb>base</rb>");
    assert!(rb_closed, "rb should be implicitly closed: {output:?}");

    let rt_closed = output.contains("<rt>annotation</rt>");
    assert!(rt_closed, "rt should be present: {output:?}");
}
/// A `<math>` element with a child must appear in the output as `<math>`.
#[test]
fn math_start_tag_inserts_foreign_element() {
    let output = html::parse("<html><body><math><mi>x</mi></math></body></html>")
        .unwrap()
        .to_string();

    let math_present = output.contains("<math>");
    assert!(math_present, "math element should be present: {output:?}");
}
/// A self-closing `<math/>` followed by a paragraph: the output must contain
/// `<math>` and the complete `<p>after</p>`.
#[test]
fn math_self_closing_pops_immediately() {
    let output = html::parse("<html><body><math/><p>after</p></body></html>")
        .unwrap()
        .to_string();

    let math_present = output.contains("<math>");
    assert!(math_present, "math element should be present: {output:?}");

    let paragraph_kept = output.contains("<p>after</p>");
    assert!(
        paragraph_kept,
        "content after self-closing math should be preserved: {output:?}"
    );
}
/// A `<caption>` appearing directly in the body must not disturb the paragraph
/// that follows it.
///
/// Only the following paragraph is asserted; the fate of the `<caption>` tag
/// itself is not checked.
#[test]
fn table_related_start_tags_in_body_are_ignored() {
    let markup = "<html><body><caption>text</caption><p>content</p></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let paragraph_kept = output.contains("<p>content</p>");
    assert!(
        paragraph_kept,
        "Body content should be preserved: {output:?}"
    );
}
/// A `<frame>` start tag appearing in the body must not disturb the paragraph
/// that follows it.
///
/// Only the following paragraph is asserted; the fate of the `<frame>` tag
/// itself is not checked.
#[test]
fn frame_start_tag_in_body_is_ignored() {
    let output = html::parse("<html><body><frame><p>content</p></body></html>")
        .unwrap()
        .to_string();

    let paragraph_kept = output.contains("<p>content</p>");
    assert!(
        paragraph_kept,
        "Body content should be preserved: {output:?}"
    );
}
/// A second DOCTYPE placed inside the body must be dropped: the output must
/// contain `<!DOCTYPE` exactly once and still contain the paragraph.
#[test]
fn doctype_in_body_is_ignored() {
    let markup = "<!DOCTYPE html><html><body><!DOCTYPE html><p>text</p></body></html>";
    let output = html::parse(markup).unwrap().to_string();

    let doctype_count = output.matches("<!DOCTYPE").count();
    assert_eq!(
        doctype_count, 1,
        "Only one DOCTYPE should survive; got: {output:?}"
    );

    let paragraph_kept = output.contains("<p>text</p>");
    assert!(
        paragraph_kept,
        "Body content should be preserved: {output:?}"
    );
}
/// Finds the first element node named `math` in the parsed tree and checks
/// that its `namespace` is the MathML namespace URI.
#[test]
fn math_element_has_mathml_namespace() {
    let tree = html::parse("<html><body><math><mi>x</mi></math></body></html>").unwrap();

    let math_element = tree
        .iter()
        .find_map(|node| {
            if let XpathItemTreeNode::ElementNode(element) = node {
                if element.name == "math" {
                    return Some(element);
                }
            }
            None
        })
        .expect("math element should be present");

    let namespace = math_element.namespace.as_deref();
    assert_eq!(
        namespace,
        Some("http://www.w3.org/1998/Math/MathML"),
        "math element should have MathML namespace"
    );
}
/// Finds the first element node named `svg` in the parsed tree and checks that
/// its `namespace` is the SVG namespace URI.
#[test]
fn svg_element_has_svg_namespace() {
    let tree = html::parse("<html><body><svg></svg></body></html>").unwrap();

    let svg_element = tree
        .iter()
        .find_map(|node| {
            if let XpathItemTreeNode::ElementNode(element) = node {
                if element.name == "svg" {
                    return Some(element);
                }
            }
            None
        })
        .expect("svg element should be present");

    let namespace = svg_element.namespace.as_deref();
    assert_eq!(
        namespace,
        Some("http://www.w3.org/2000/svg"),
        "svg element should have SVG namespace"
    );
}
/// Finds the first element node named `p` in the parsed tree and checks that
/// its `namespace` is `None`.
#[test]
fn html_element_has_no_namespace() {
    let tree = html::parse("<html><body><p>text</p></body></html>").unwrap();

    let p_element = tree
        .iter()
        .find_map(|node| {
            if let XpathItemTreeNode::ElementNode(element) = node {
                if element.name == "p" {
                    return Some(element);
                }
            }
            None
        })
        .expect("p element should be present");

    assert_eq!(
        p_element.namespace, None,
        "HTML elements should have no namespace (None)"
    );
}
/// A lowercase `definitionurl` attribute on `<math>` must be serialized as
/// `definitionURL`, with no lowercase spelling left in the output.
#[test]
fn math_definitionurl_attribute_is_adjusted() {
    let output = html::parse(
        r#"<html><body><math definitionurl="http://example.com"></math></body></html>"#,
    )
    .unwrap()
    .to_string();

    let adjusted_present = output.contains(r#"definitionURL="http://example.com""#);
    assert!(
        adjusted_present,
        "definitionurl should be adjusted to definitionURL: {output:?}"
    );

    let lowercase_present = output.contains(r#"definitionurl="#);
    assert!(
        !lowercase_present,
        "lowercased definitionurl should not remain: {output:?}"
    );
}
/// A `viewBox` attribute on `<svg>` must be serialized with its camel-case
/// spelling and original value.
#[test]
fn svg_viewbox_attribute_is_adjusted_to_camel_case() {
    let output = html::parse(r#"<html><body><svg viewBox="0 0 100 100"></svg></body></html>"#)
        .unwrap()
        .to_string();

    let view_box_present = output.contains(r#"viewBox="0 0 100 100""#);
    assert!(
        view_box_present,
        "viewBox should preserve correct casing: {output:?}"
    );
}
/// Two camel-case SVG attributes on the same element must both be serialized
/// with their camel-case spelling and original values.
#[test]
fn svg_multiple_attributes_are_adjusted() {
    let output = html::parse(
        r#"<html><body><svg viewBox="0 0 10 10" preserveAspectRatio="xMidYMid"></svg></body></html>"#,
    )
    .unwrap()
    .to_string();

    let view_box_present = output.contains(r#"viewBox="0 0 10 10""#);
    let aspect_ratio_present = output.contains(r#"preserveAspectRatio="xMidYMid""#);

    assert!(
        view_box_present,
        "viewBox should be correctly cased: {output:?}"
    );
    assert!(
        aspect_ratio_present,
        "preserveAspectRatio should be correctly cased: {output:?}"
    );
}
/// A self-closing `<math/>` carrying a lowercase `definitionurl` attribute:
/// the attribute must be serialized as `definitionURL` and the paragraph that
/// follows must be intact.
#[test]
fn math_self_closing_with_definitionurl() {
    let output = html::parse(
        r#"<html><body><math definitionurl="http://example.com"/><p>after</p></body></html>"#,
    )
    .unwrap()
    .to_string();

    let adjusted_present = output.contains(r#"definitionURL="http://example.com""#);
    assert!(
        adjusted_present,
        "definitionURL should be adjusted on self-closing math: {output:?}"
    );

    let paragraph_kept = output.contains("<p>after</p>");
    assert!(
        paragraph_kept,
        "content after self-closing math should be preserved: {output:?}"
    );
}
/// Ordinary lowercase attributes on `<svg>` (`width`, `height`, `fill`) must
/// be serialized with the same lowercase names and values.
#[test]
fn svg_regular_attributes_stay_lowercase() {
    let output = html::parse(
        r#"<html><body><svg width="100" height="100" fill="none"></svg></body></html>"#,
    )
    .unwrap()
    .to_string();

    let width_kept = output.contains(r#"width="100""#);
    let height_kept = output.contains(r#"height="100""#);
    let fill_kept = output.contains(r#"fill="none""#);

    assert!(width_kept, "width should remain lowercase: {output:?}");
    assert!(height_kept, "height should remain lowercase: {output:?}");
    assert!(fill_kept, "fill should remain lowercase: {output:?}");
}
/// A second `<body>` start tag with a new attribute: the paragraph after it
/// must be kept, the first tag's `class` must remain, and the second tag's
/// `id` must appear in the output.
#[test]
fn second_body_tag_merges_attributes_not_after_body() {
    let output =
        html::parse(r#"<html><body class="a"><body id="extra"><p>content</p></body></html>"#)
            .unwrap()
            .to_string();

    let paragraph_kept = output.contains("<p>content</p>");
    let class_kept = output.contains(r#"class="a""#);
    let id_merged = output.contains(r#"id="extra""#);

    assert!(
        paragraph_kept,
        "Content after second <body> tag should be preserved in body: {output:?}"
    );
    assert!(
        class_kept,
        "Original body attributes should be preserved: {output:?}"
    );
    assert!(
        id_merged,
        "New attributes from second body tag should be merged: {output:?}"
    );
}
/// A `<b>` closed inside a table cell must not wrap the paragraph that follows
/// the table: the output must contain `<p>after</p>` and not `<b>after</b>`.
#[test]
fn active_formatting_elements_marker_is_cleared() {
    let output = html::parse(
        r#"<html><body><table><tr><td><b>bold</b></td><td>normal</td></tr></table><p>after</p></body></html>"#,
    )
    .unwrap()
    .to_string();

    let bold_leaked = output.contains("<b>after</b>");
    assert!(
        !bold_leaked,
        "Formatting should not leak past table boundary: {output:?}"
    );

    let paragraph_plain = output.contains("<p>after</p>");
    assert!(
        paragraph_plain,
        "Paragraph after table should be normal: {output:?}"
    );
}
/// Parses a document with 500 nested `<div>` elements wrapped around the text
/// `content` and checks that the text is still present after serialization.
///
/// As the test name indicates, the concern is that parsing and serializing at
/// this nesting depth complete at all.
#[test]
fn deeply_nested_elements_do_not_stack_overflow() {
    const DEPTH: usize = 500;

    // Named `markup` so the local does not read like the `html` module that is
    // called on it below.
    let markup = format!(
        "<html><body>{}content{}</body></html>",
        "<div>".repeat(DEPTH),
        "</div>".repeat(DEPTH)
    );

    let output = html::parse(&markup).unwrap().to_string();
    assert!(
        output.contains("content"),
        "Deeply nested content should be preserved: {}",
        // Truncate by characters: slicing at a fixed byte offset would panic if
        // the offset fell inside a multi-byte character.
        output.chars().take(200).collect::<String>()
    );
}