use skyscraper::html::{self, grammar::HtmlParser, QuirksMode};
use skyscraper::xpath;
use skyscraper::xpath::grammar::data_model::AnyAtomicType;
/// A CDATA section placed in ordinary HTML body content must parse successfully.
#[test]
fn cdata_in_html_content_does_not_crash() {
    let markup = "<html><body><![CDATA[some data]]></body></html>";
    assert!(
        html::parse(markup).is_ok(),
        "CDATA in HTML content should not crash"
    );
}
/// Both `<div>` elements surrounding a CDATA section must still be counted by
/// `count(//div)`.
#[test]
fn cdata_in_html_preserves_markup_structure() {
    let markup = "<html><body><div>before</div><![CDATA[data]]><div>after</div></body></html>";
    let document = html::parse(markup).unwrap();
    let div_counter = xpath::parse("count(//div)").unwrap();
    let items = div_counter.apply(&document).unwrap();
    match items[0].extract_as_any_atomic_type() {
        AnyAtomicType::Integer(n) => {
            assert_eq!(*n, 2, "Both divs should be present after CDATA handling")
        }
        other => panic!("Expected integer count, got: {other:?}"),
    }
}
/// A bare `<!DOCTYPE>` (no name) is expected to put the document in quirks mode.
#[test]
fn doctype_immediate_greater_than_sets_quirks() {
    let document = html::parse("<!DOCTYPE><html><head></head><body></body></html>").unwrap();
    let mode = document.quirks_mode();
    assert_eq!(
        mode,
        QuirksMode::Quirks,
        "<!DOCTYPE> with no name should trigger quirks mode"
    );
}
/// Stray characters after the system identifier must leave the document in
/// no-quirks mode.
#[test]
fn doctype_trailing_chars_after_system_id_no_quirks() {
    let markup = r#"<!DOCTYPE html SYSTEM "about:legacy-compat" x><html><head></head><body></body></html>"#;
    let mode = html::parse(markup).unwrap().quirks_mode();
    assert_eq!(
        mode,
        QuirksMode::NoQuirks,
        "Trailing chars after system identifier should not force quirks"
    );
}
/// The serialized document must not contain `/</script>` after parsing a
/// script whose content opens with `<!--`.
#[test]
fn script_escaped_end_tag_no_spurious_slash() {
    let document = html::parse("<html><body><script><!--x</script></body></html>").unwrap();
    let output = document.to_string();
    let has_spurious_slash = output.contains("/</script>");
    assert!(
        !has_spurious_slash,
        "No spurious '/' should appear before the end tag: {output:?}"
    );
}
/// A `<template>` that is never closed must still parse successfully.
#[test]
fn template_eof_does_not_crash() {
    let markup = "<html><body><template><div>inside</div></body></html>";
    let parsed = html::parse(markup);
    assert!(parsed.is_ok(), "Unclosed template at EOF should not crash");
}
/// Input that ends inside an open `<template>` must parse without error.
#[test]
fn template_eof_template_content_preserved() {
    let parsed = html::parse("<html><body><template><p>hello");
    assert!(
        parsed.is_ok(),
        "Template with EOF should parse without error"
    );
}
/// A `</body>` appearing inside an open `<table>` must not lose the table's
/// cell text from the serialized output.
#[test]
fn end_body_ignored_when_not_in_scope() {
    let markup =
        "<html><head></head><body><table></body><tr><td>cell</td></tr></table></body></html>";
    let output = html::parse(markup).unwrap().to_string();
    assert!(
        output.contains("cell"),
        "Table content should be preserved when </body> is ignored: {output:?}"
    );
}
/// A `<b>` wrapping a table with a cell must parse successfully.
#[test]
fn adoption_agency_stops_at_marker() {
    let markup = r#"<html><body><b><table><tr><td></td></tr></table></b></body></html>"#;
    let parsed = html::parse(markup);
    assert!(
        parsed.is_ok(),
        "Adoption agency with markers should not crash"
    );
}
/// All three text runs around and inside the table must appear in the
/// serialized output.
#[test]
fn adoption_agency_marker_boundary_produces_correct_tree() {
    let markup = r#"<div><b>bold<table><tr><td>cell</td></tr></table>more</b></div>"#;
    let output = html::parse(markup).unwrap().to_string();
    // Checked in document order so the first missing run is the one reported.
    for needle in ["bold", "cell", "more"] {
        assert!(
            output.contains(needle),
            "{needle} text should be present: {output:?}"
        );
    }
}
/// A hidden input followed by a text input must leave "input" in the
/// serialized document.
#[test]
fn input_type_hidden_preserves_frameset_ok() {
    let markup =
        r#"<html><head></head><body><input type="hidden"><input type="text"></body></html>"#;
    let output = html::parse(markup).unwrap().to_string();
    assert!(
        output.contains("input"),
        "Both inputs should be in the document: {output:?}"
    );
}
/// Unescaping a numeric character reference that names a UTF-16 surrogate
/// (U+D800) must not panic and must yield U+FFFD.
#[test]
fn unescape_surrogate_codepoint_does_not_panic() {
    // The input has to be the *escaped* reference: feeding a literal U+FFFD
    // would pass through unchanged and never reach the surrogate handling.
    let result = html::unescape_characters("&#xD800;");
    assert!(
        result.contains('\u{FFFD}'),
        "Surrogate codepoint should be replaced with U+FFFD: {result:?}"
    );
}
/// A valid decimal character reference must decode to its character.
#[test]
fn unescape_valid_numeric_reference() {
    // Use the escaped form so the numeric-reference path is actually exercised.
    let result = html::unescape_characters("&#65;");
    assert_eq!(result, "A", "&#65; should produce 'A'");
}
/// Unclosed `<p>` elements must not make `html::parse` return an error.
#[test]
fn default_parser_handles_parse_errors_gracefully() {
    let markup =
        "<html><body><p>unclosed paragraph<p>second paragraph<div>in div</div></body></html>";
    let parsed = html::parse(markup);
    assert!(
        parsed.is_ok(),
        "Parser should swallow parse errors by default: {:?}",
        parsed.err()
    );
}
/// Misnested `<b>`/`<i>` tags must keep both text runs in the serialized output.
#[test]
fn default_parser_handles_misnested_formatting() {
    let markup = "<html><body><b><i>bold-italic</b>italic-only</i></body></html>";
    let output = html::parse(markup).unwrap().to_string();
    for needle in ["bold-italic", "italic-only"] {
        assert!(
            output.contains(needle),
            "Content should be preserved: {output:?}"
        );
    }
}
/// Four nested identical `<b>` elements must yield at most four `<b>` nodes.
#[test]
fn noahs_ark_handles_many_identical_elements() {
    let markup = "<html><body><b>1<b>2<b>3<b>4</b></b></b></b></body></html>";
    let document = html::parse(markup).unwrap();
    let bold_counter = xpath::parse("count(//b)").unwrap();
    let items = bold_counter.apply(&document).unwrap();
    match items[0].extract_as_any_atomic_type() {
        AnyAtomicType::Integer(n) => {
            assert!(
                *n <= 4,
                "Noah's Ark should limit formatting elements: got {n}"
            )
        }
        other => panic!("Expected integer count, got: {other:?}"),
    }
}
/// A document with no `<html>` wrapper but explicit head and body must parse.
#[test]
fn head_as_last_element_uses_in_body_not_in_head() {
    let parsed = html::parse("<head><title>test</title></head><body><p>content</p></body>");
    assert!(
        parsed.is_ok(),
        "Parsing with head element should not crash: {:?}",
        parsed.err()
    );
}
/// A stray `</div>` with no matching start tag must not break parsing.
#[test]
fn end_tag_mismatch_does_not_panic() {
    let parsed = html::parse("<html><body><p>text</p></div></body></html>");
    assert!(
        parsed.is_ok(),
        "Mismatched end tag should not panic: {:?}",
        parsed.err()
    );
}
/// Closing an `<li>` that contains a `<p>` must parse successfully.
#[test]
fn end_li_mismatch_does_not_panic() {
    let parsed = html::parse("<html><body><ul><li>item<p>nested</p></li></ul></body></html>");
    assert!(
        parsed.is_ok(),
        "li end tag handling should not panic: {:?}",
        parsed.err()
    );
}
/// Out-of-order `</dd>` / `</dt>` end tags must parse successfully.
#[test]
fn end_dd_dt_mismatch_does_not_panic() {
    let markup = "<html><body><dl><dt>term<dd>def<p>nested</p></dd></dt></dl></body></html>";
    let parsed = html::parse(markup);
    assert!(
        parsed.is_ok(),
        "dd/dt end tag handling should not panic: {:?}",
        parsed.err()
    );
}
/// An `<h1>` containing an inline child must close without error.
#[test]
fn heading_end_tag_mismatch_does_not_panic() {
    let parsed = html::parse("<html><body><h1><span>text</span></h1></body></html>");
    assert!(
        parsed.is_ok(),
        "Heading end tag handling should not panic: {:?}",
        parsed.err()
    );
}
/// Nested ruby annotation elements with late end tags must parse successfully.
#[test]
fn ruby_rt_rp_mismatch_does_not_panic() {
    let markup = "<html><body><ruby>base<rb>base2<rt>annotation<rp>(</rp>alt<rp>)</rp></rt></rb></ruby></body></html>";
    let parsed = html::parse(markup);
    assert!(
        parsed.is_ok(),
        "ruby/rt/rp handling should not panic: {:?}",
        parsed.err()
    );
}
/// An escaped ampersand followed by `lt;` must be unescaped exactly once.
#[test]
fn unescape_no_double_unescape() {
    // The input carries two levels of escaping; only the outer one may be
    // removed, otherwise the result would collapse all the way to `<`.
    let result = html::unescape_characters("&amp;lt;");
    assert_eq!(
        result, "&lt;",
        "&amp;lt; should become &lt;, not be double-unescaped to <"
    );
}
/// An escaped ampersand followed by `gt;` must be unescaped exactly once.
#[test]
fn unescape_amp_gt_no_double_unescape() {
    // Two levels of escaping in, one level out.
    let result = html::unescape_characters("&amp;gt;");
    assert_eq!(result, "&gt;", "&amp;gt; should become &gt;, not >");
}
/// A doubly escaped ampersand must be unescaped exactly once.
#[test]
fn unescape_amp_amp_no_double_unescape() {
    // Two levels of escaping in, one level out.
    let result = html::unescape_characters("&amp;amp;");
    assert_eq!(result, "&amp;", "&amp;amp; should become &amp;, not &");
}
/// The four basic named entities must decode to `<`, `>`, `&` and `"`.
#[test]
fn unescape_basic_entities_still_work() {
    // The input must be the escaped entities: the unescaped characters would
    // both unbalance the string literal and leave nothing to decode.
    let result = html::unescape_characters("&lt;&gt;&amp;&quot;");
    assert_eq!(result, r#"<>&""#, "Basic entity unescaping should work");
}
/// Overlapping `<b>`/`<i>` markup must still produce a `<b><i>` nesting and
/// keep all three text runs under `<body>`.
#[test]
fn adoption_agency_overlapping_formatting_no_double_push() {
    let text = "<html><body><b><i>bold-italic</b>just-italic</i>normal</body></html>";
    let document = html::parse(text).unwrap();

    let xp_bold_italic = xpath::parse("//b/i").unwrap();
    let result = xp_bold_italic.apply(&document).unwrap();
    assert!(
        !result.is_empty(),
        "Adoption agency should produce <b><i> nesting"
    );

    // Concatenate every text node below <body> and check each run survived.
    let xp_all_text = xpath::parse("//body//text()").unwrap();
    let all_text = xp_all_text.apply(&document).unwrap();
    let text_content: String = all_text
        .iter()
        .filter_map(|item| item.extract_as_node().text(&document))
        .collect();
    assert!(
        text_content.contains("bold-italic"),
        "bold-italic text should be present: {text_content}"
    );
    assert!(
        text_content.contains("just-italic"),
        "just-italic text should be present: {text_content}"
    );
    assert!(
        text_content.contains("normal"),
        "normal text should be present: {text_content}"
    );
}
/// With `<a>`, `<b>` and `<em>` overlapping, both text runs must remain under
/// `<body>`.
#[test]
fn adoption_agency_triple_overlap_no_corruption() {
    let markup = "<html><body><a href='#'><b><em>text</a>after</em></b></body></html>";
    let document = html::parse(markup).unwrap();
    let text_nodes = xpath::parse("//body//text()")
        .unwrap()
        .apply(&document)
        .unwrap();
    let text_content: String = text_nodes
        .iter()
        .filter_map(|item| item.extract_as_node().text(&document))
        .collect();
    for needle in ["text", "after"] {
        assert!(
            text_content.contains(needle),
            "{needle} should be present: {text_content}"
        );
    }
}
/// The serialized output of a script containing a nested `<script>` inside a
/// comment must still contain a `<` character.
#[test]
fn script_double_escaped_preserves_less_than() {
    let markup =
        "<html><body><script><!--<script>var x = 1 < 2;</script>--></script></body></html>";
    let output = html::parse(markup).unwrap().to_string();
    let has_less_than = output.contains('<');
    assert!(
        has_less_than,
        "Less-than signs in double-escaped script should be preserved: {output:?}"
    );
}
/// A `<b>` placed directly inside `<table>` must not stop the `<td>` from
/// appearing in the tree.
#[test]
fn adoption_agency_step_15_foster_parenting() {
    let markup = "<html><body><table><b><tr><td>cell</td></tr></b></table></body></html>";
    let document = html::parse(markup).unwrap();
    let cells = xpath::parse("//td").unwrap().apply(&document).unwrap();
    assert!(
        !cells.is_empty(),
        "Table cell should be present in the document"
    );
}
/// A `<p>` nested in `<svg><foreignObject>` must be reachable via `//p`.
#[test]
fn svg_foreign_object_is_scope_barrier() {
    let markup =
        "<html><body><svg><foreignObject><p>html content</p></foreignObject></svg></body></html>";
    let document = html::parse(markup).unwrap();
    let paragraphs = xpath::parse("//p").unwrap().apply(&document).unwrap();
    assert!(
        !paragraphs.is_empty(),
        "<p> inside <foreignObject> should be found"
    );
}
/// Four `<b>` elements with the same attributes in differing order must keep
/// the first and last text runs in the serialized output.
#[test]
fn noahs_ark_attribute_comparison_order_independent() {
    let markup = r#"<html><body>
<b class="x" id="a">1</b>
<b id="a" class="x">2</b>
<b class="x" id="a">3</b>
<b id="a" class="x">4
</body></html>"#;
    let output = html::parse(markup).unwrap().to_string();
    for digit in ['1', '4'] {
        assert!(
            output.contains(digit),
            "Text {digit} should be present: {output:?}"
        );
    }
}
/// 260 nested `<div>`s must serialize with their text intact and all be
/// counted by `count(//div)`.
#[test]
fn deeply_nested_document_display_no_overflow() {
    let depth = 260;
    let opening: String = (0..depth).map(|_| "<div>").collect();
    let closing: String = (0..depth).map(|_| "</div>").collect();
    let full_html = format!("<html><body>{opening}deep{closing}</body></html>");

    let document = html::parse(&full_html).unwrap();
    let output = document.to_string();
    assert!(
        output.contains("deep"),
        "Deeply nested content should be preserved"
    );

    let div_counter = xpath::parse("count(//div)").unwrap();
    let items = div_counter.apply(&document).unwrap();
    match items[0].extract_as_any_atomic_type() {
        AnyAtomicType::Integer(n) => {
            assert_eq!(*n, depth, "All {depth} nested divs should be present")
        }
        other => panic!("Expected integer count, got: {other:?}"),
    }
}
/// Input that stops mid-element must still yield a tree containing the `<div>`.
#[test]
fn truncated_input_gets_eof_finalization() {
    let document = html::parse("<div>hello").unwrap();
    let divs = xpath::parse("//div").unwrap().apply(&document).unwrap();
    assert!(
        !divs.is_empty(),
        "Truncated input should still produce a valid tree with the div element"
    );
}
/// Parsing the empty string must serialize to the html/head/body skeleton.
#[test]
fn empty_input_creates_implied_elements() {
    let output = html::parse("").unwrap().to_string();
    let expected_skeleton = "<html><head></head><body></body></html>";
    assert_eq!(
        output, expected_skeleton,
        "Empty document should have implied html/head/body per WHATWG spec"
    );
}
/// Hexadecimal character references must decode to their characters.
#[test]
fn unescape_hex_character_references() {
    // Use the escaped form so the hex-reference path is actually exercised.
    let result = html::unescape_characters("&#x27;hello&#x27;");
    assert_eq!(
        result, "'hello'",
        "Hex char ref &#x27; should produce apostrophe"
    );
}
/// Hex and decimal character references mixed in one string must all decode.
#[test]
fn unescape_mixed_hex_and_decimal() {
    // Hex, decimal, hex: the escaped forms are required for the test to
    // exercise any decoding at all.
    let result = html::unescape_characters("&#x41;&#66;&#x43;");
    assert_eq!(
        result, "ABC",
        "Mixed hex (&#x41;, &#x43;) and decimal (&#66;) should all work"
    );
}
/// A hex character reference written with an uppercase `X` must decode.
#[test]
fn unescape_uppercase_hex() {
    // NOTE(review): the digits 2019 contain no letters, so "uppercase" is
    // presumed to refer to the `X` marker — confirm against the test's intent.
    let result = html::unescape_characters("&#X2019;");
    assert_eq!(
        result, "\u{2019}",
        "Uppercase hex &#X2019; should produce right single quotation mark"
    );
}
/// `<b>` nested inside `<i>` inside `<b>` must leave at least one `<b>` node.
#[test]
fn adoption_agency_nested_formatting_elements() {
    let markup = "<html><body><b>1<i>2<b>3</b>4</i>5</b></body></html>";
    let document = html::parse(markup).unwrap();
    let bolds = xpath::parse("//b").unwrap().apply(&document).unwrap();
    assert!(
        !bolds.is_empty(),
        "Nested formatting elements should be parsed without errors"
    );
}
/// The string value of `<body>` must be exactly "text" when the text sits
/// inside three nested `<b>` elements.
#[test]
fn adoption_agency_deeply_nested_same_tag() {
    let markup = "<html><body><b><b><b>text</b></b></b></body></html>";
    let document = html::parse(markup).unwrap();
    let body_string = xpath::parse("string(//body)").unwrap();
    let items = body_string.apply(&document).unwrap();
    let expected = xpath::grammar::data_model::XpathItem::AnyAtomicType(AnyAtomicType::String(
        "text".to_string(),
    ));
    assert_eq!(
        items[0], expected,
        "Deeply nested same formatting tags should preserve text content"
    );
}
/// Exactly one `<search>` element must be present after parsing.
#[test]
fn search_end_tag_handled_correctly() {
    let markup = "<html><body><search><p>content</p></search></body></html>";
    let document = html::parse(markup).unwrap();
    let items = xpath::parse("count(//search)")
        .unwrap()
        .apply(&document)
        .unwrap();
    if let xpath::grammar::data_model::XpathItem::AnyAtomicType(AnyAtomicType::Integer(n)) =
        &items[0]
    {
        assert_eq!(*n, 1, "There should be exactly one <search> element");
    } else {
        panic!("Expected integer count, got: {:?}", &items[0]);
    }
}
/// The `<div>` written inside `<search>` must be its child in the tree.
#[test]
fn search_element_contains_children() {
    let markup = "<html><body><search><div>inner</div></search><p>after</p></body></html>";
    let document = html::parse(markup).unwrap();
    let items = xpath::parse("count(//search/div)")
        .unwrap()
        .apply(&document)
        .unwrap();
    if let xpath::grammar::data_model::XpathItem::AnyAtomicType(AnyAtomicType::Integer(n)) =
        &items[0]
    {
        assert_eq!(
            *n, 1,
            "The <div> should be a child of <search>, not a sibling"
        );
    } else {
        panic!("Expected integer count, got: {:?}", &items[0]);
    }
}
/// The `<b>` must stay inside the first `<p>`, and `<body>` must have two
/// direct `<p>` children.
#[test]
fn close_p_element_pops_stack_with_inline_elements() {
    let markup = "<html><body><p><b>bold text</b></p><p>next</p></body></html>";
    let document = html::parse(markup).unwrap();

    let bold_in_first_p = xpath::parse("//p[1]/b").unwrap().apply(&document).unwrap();
    assert_eq!(
        bold_in_first_p.len(),
        1,
        "The <b> should be a child of the first <p>"
    );

    let paragraph_count = xpath::parse("count(//body/p)")
        .unwrap()
        .apply(&document)
        .unwrap();
    if let xpath::grammar::data_model::XpathItem::AnyAtomicType(AnyAtomicType::Integer(n)) =
        &paragraph_count[0]
    {
        assert_eq!(
            *n, 2,
            "There should be 2 <p> elements as direct children of <body>"
        );
    } else {
        panic!("Expected integer count, got: {:?}", &paragraph_count[0]);
    }
}
/// `//p/em/strong` must match exactly one node.
#[test]
fn close_p_element_with_nested_inline() {
    let markup = "<html><body><p><em><strong>text</strong></em></p></body></html>";
    let document = html::parse(markup).unwrap();
    let matches = xpath::parse("//p/em/strong")
        .unwrap()
        .apply(&document)
        .unwrap();
    assert_eq!(
        matches.len(),
        1,
        "<strong> should be nested inside <em> inside <p>"
    );
}
/// Both `<div>`s around a CDATA section must be found, and the first must
/// still have a text node.
#[test]
fn cdata_in_html_namespace_treated_as_bogus_comment() {
    let markup = "<html><body><div>before</div><![CDATA[data]]><div>after</div></body></html>";
    let document = html::parse(markup).unwrap();

    let divs = xpath::parse("//div").unwrap().apply(&document).unwrap();
    assert_eq!(divs.len(), 2, "both divs should survive CDATA bogus comment");

    let first_div_text = xpath::parse("//div[1]/text()")
        .unwrap()
        .apply(&document)
        .unwrap();
    assert!(
        !first_div_text.is_empty(),
        "first div should have text content"
    );
}
/// Leading whitespace before the DOCTYPE must not lose the body text.
#[test]
fn whitespace_before_doctype_preserved() {
    let markup = "  <!DOCTYPE html><html><head></head><body>hello</body></html>";
    let document = html::parse(markup).unwrap();
    let body_text = xpath::parse("//body/text()")
        .unwrap()
        .apply(&document)
        .unwrap();
    assert!(!body_text.is_empty(), "body text should be present");
}
/// Whitespace around `<title>` inside `<head>` must not lose the title text.
#[test]
fn whitespace_between_head_tags_preserved() {
    let markup = "<html><head>  <title>test</title>  </head><body>ok</body></html>";
    let document = html::parse(markup).unwrap();
    let title_text = xpath::parse("//title/text()")
        .unwrap()
        .apply(&document)
        .unwrap();
    assert!(!title_text.is_empty(), "title text should be present");
}
/// Whitespace between `</head>` and `<body>` must not lose the paragraph text.
#[test]
fn whitespace_after_head_before_body() {
    let markup = "<html><head></head>  <body><p>content</p></body></html>";
    let document = html::parse(markup).unwrap();
    let paragraph_text = xpath::parse("//p/text()")
        .unwrap()
        .apply(&document)
        .unwrap();
    assert!(
        !paragraph_text.is_empty(),
        "paragraph content should be present"
    );
}
/// Each malformed input (repeated end tags, truncated document) must parse
/// successfully.
#[test]
fn malformed_html_does_not_panic() {
    let samples = [
        "<html><body></body></html></html></html>",
        "<html><head></head></head><body></body></html>",
        "<!DOCTYPE html><html>",
        "<noscript></noscript></noscript>",
    ];
    for &sample in &samples {
        let parsed = html::parse(sample);
        assert!(
            parsed.is_ok(),
            "parser should handle malformed HTML without panicking: {}",
            sample
        );
    }
}
/// A `>` inside an unterminated double-quoted PUBLIC identifier must result in
/// quirks mode.
#[test]
fn doctype_public_identifier_double_quoted_abrupt_gt() {
    let markup = r#"<!DOCTYPE html PUBLIC "foo>rest of document"#;
    let mode = html::parse(markup).unwrap().quirks_mode();
    assert_eq!(
        mode,
        QuirksMode::Quirks,
        "abrupt `>` in double-quoted public identifier should trigger quirks mode"
    );
}
/// A `>` inside an unterminated single-quoted PUBLIC identifier must result in
/// quirks mode.
#[test]
fn doctype_public_identifier_single_quoted_abrupt_gt() {
    let markup = "<!DOCTYPE html PUBLIC 'foo>rest of document";
    let mode = html::parse(markup).unwrap().quirks_mode();
    assert_eq!(
        mode,
        QuirksMode::Quirks,
        "abrupt `>` in single-quoted public identifier should trigger quirks mode"
    );
}
/// A `>` inside an unterminated double-quoted SYSTEM identifier must result in
/// quirks mode.
#[test]
fn doctype_system_identifier_double_quoted_abrupt_gt() {
    let markup = r#"<!DOCTYPE html SYSTEM "foo>rest of document"#;
    let mode = html::parse(markup).unwrap().quirks_mode();
    assert_eq!(
        mode,
        QuirksMode::Quirks,
        "abrupt `>` in double-quoted system identifier should trigger quirks mode"
    );
}
/// A `>` inside an unterminated single-quoted SYSTEM identifier must result in
/// quirks mode.
#[test]
fn doctype_system_identifier_single_quoted_abrupt_gt() {
    let markup = "<!DOCTYPE html SYSTEM 'foo>rest of document";
    let mode = html::parse(markup).unwrap().quirks_mode();
    assert_eq!(
        mode,
        QuirksMode::Quirks,
        "abrupt `>` in single-quoted system identifier should trigger quirks mode"
    );
}
/// Markup following the abrupt `>` of a broken DOCTYPE must be parsed as HTML.
#[test]
fn doctype_public_identifier_abrupt_gt_content_after_gt_is_parsed() {
    let markup = r#"<!DOCTYPE html PUBLIC "foo><html><body><p>hello</p></body></html>"#;
    let document = html::parse(markup).unwrap();
    let paragraph_text = xpath::parse("//p/text()")
        .unwrap()
        .apply(&document)
        .unwrap();
    assert_eq!(
        paragraph_text.len(),
        1,
        "content after abrupt `>` should be parsed as HTML"
    );
}
/// A non-ASCII letter directly after `&` must remain in the body text node.
#[test]
fn ambiguous_ampersand_unicode_not_consumed() {
    let markup = "<html><body>&é</body></html>";
    let document = html::parse(markup).unwrap();
    let body_text = xpath::parse("/html/body/text()")
        .unwrap()
        .apply(&document)
        .unwrap();
    assert!(!body_text.is_empty());
    let text_content = body_text[0]
        .extract_as_node()
        .extract_as_text_node()
        .content
        .clone();
    assert!(
        text_content.contains('é'),
        "Unicode letter after & should be preserved, got: {}",
        text_content
    );
}
/// Parsing two documents with one `HtmlParser` must give independent trees:
/// the second must not contain the first document's `<p>`.
#[test]
fn html_parser_reuse_produces_correct_results() {
    let mut parser = HtmlParser::new();

    // First document.
    let first_doc = parser
        .parse("<html><body><p>first</p></body></html>")
        .unwrap();
    let first_hits = xpath::parse("//p/text()")
        .unwrap()
        .apply(&first_doc)
        .unwrap();
    assert_eq!(first_hits.len(), 1);
    let first_text = first_hits[0]
        .extract_as_node()
        .extract_as_text_node()
        .content
        .clone();
    assert_eq!(first_text, "first");

    // Second document, same parser instance.
    let second_doc = parser
        .parse("<html><body><div>second</div></body></html>")
        .unwrap();
    let second_hits = xpath::parse("//div/text()")
        .unwrap()
        .apply(&second_doc)
        .unwrap();
    assert_eq!(second_hits.len(), 1);
    let second_text = second_hits[0]
        .extract_as_node()
        .extract_as_text_node()
        .content
        .clone();
    assert_eq!(second_text, "second");

    let leaked = xpath::parse("//p").unwrap().apply(&second_doc).unwrap();
    assert_eq!(
        leaked.len(),
        0,
        "second parse should not contain elements from first parse"
    );
}
/// Script content wrapped in `<!-- ... -->` must be kept as a single text
/// node containing the statement.
#[test]
fn script_double_escape_start_buffer_comparison() {
    let markup =
        "<html><head><script><!--\nvar x = 1;\n--></script></head><body></body></html>";
    let document = html::parse(markup).unwrap();
    let script_nodes = xpath::parse("//script/text()")
        .unwrap()
        .apply(&document)
        .unwrap();
    assert_eq!(script_nodes.len(), 1);
    let script_text = script_nodes[0]
        .extract_as_node()
        .extract_as_text_node()
        .content
        .clone();
    assert!(
        script_text.contains("var x = 1;"),
        "Script content should be preserved through escaped states. Got: {script_text:?}"
    );
}
/// Content after `</script>` must be parsed, and the script text must equal
/// the original statement.
#[test]
fn script_double_escape_end_buffer_comparison() {
    let markup = "<html><head><script>var y = 2;</script></head><body>after</body></html>";
    let document = html::parse(markup).unwrap();

    let output = document.to_string();
    assert!(
        output.contains("after"),
        "Content after script close should be parsed. Got: {output:?}"
    );

    let script_nodes = xpath::parse("//script/text()")
        .unwrap()
        .apply(&document)
        .unwrap();
    assert_eq!(script_nodes.len(), 1);
    let script_text = script_nodes[0]
        .extract_as_node()
        .extract_as_text_node()
        .content
        .clone();
    assert_eq!(script_text, "var y = 2;");
}
/// An escaped ampersand followed by the tail of a character reference must be
/// unescaped exactly once, while a plain `&lt;` must still resolve to `<`.
#[test]
fn unescape_amp_numeric_ref_no_double_unescape() {
    // Escaped ampersand + decimal reference tail: one level removed only.
    assert_eq!(
        html::unescape_characters("&amp;#60;"),
        "&#60;",
        "&amp;#60; should become literal &#60;, not <"
    );
    // Escaped ampersand + hex reference tail: one level removed only.
    assert_eq!(
        html::unescape_characters("&amp;#x3C;"),
        "&#x3C;",
        "&amp;#x3C; should become literal &#x3C;, not <"
    );
    // Single-level named entity still decodes.
    assert_eq!(
        html::unescape_characters("&lt;"),
        "<",
        "&lt; should still resolve to <"
    );
    // Escaped ampersand + named entity tail: one level removed only.
    assert_eq!(
        html::unescape_characters("&amp;lt;"),
        "&lt;",
        "&amp;lt; should become &lt;"
    );
}
/// Calling `parse_fragment` after a full `parse` on the same parser must
/// return only the fragment's content.
#[test]
fn parse_fragment_reuse_does_not_leak_state() {
    let mut parser = HtmlParser::new();

    // Full document first.
    let full_doc = parser
        .parse("<html><body><p>first</p></body></html>")
        .unwrap();
    let full_hits = xpath::parse("//p/text()")
        .unwrap()
        .apply(&full_doc)
        .unwrap();
    assert_eq!(full_hits.len(), 1);

    // Then a fragment with the same parser instance.
    let fragment_doc = parser.parse_fragment("body", "<div>fragment</div>").unwrap();
    let fragment_hits = xpath::parse("//div/text()")
        .unwrap()
        .apply(&fragment_doc)
        .unwrap();
    assert_eq!(
        fragment_hits.len(),
        1,
        "Fragment should contain div text, not stale state from previous parse"
    );
    let fragment_text = fragment_hits[0]
        .extract_as_node()
        .extract_as_text_node()
        .content
        .clone();
    assert_eq!(fragment_text, "fragment");

    let leaked = xpath::parse("//p").unwrap().apply(&fragment_doc).unwrap();
    assert_eq!(
        leaked.len(),
        0,
        "Fragment should not contain <p> from previous parse"
    );
}
/// Three `<li>` start tags without end tags must yield three `<li>` elements.
#[test]
fn li_element_parsing_correctness() {
    let markup = "<html><body><ul><li>one<li>two<li>three</ul></body></html>";
    let document = html::parse(markup).unwrap();
    let items = xpath::parse("count(//li)")
        .unwrap()
        .apply(&document)
        .unwrap();
    if let xpath::grammar::data_model::XpathItem::AnyAtomicType(AnyAtomicType::Integer(n)) =
        &items[0]
    {
        assert_eq!(*n, 3, "There should be 3 <li> elements");
    } else {
        panic!("Expected integer count, got: {:?}", &items[0]);
    }
}
/// Alternating unclosed `<dt>`/`<dd>` tags must yield two of each element.
#[test]
fn dd_dt_element_parsing_correctness() {
    let markup = "<html><body><dl><dt>term1<dd>def1<dt>term2<dd>def2</dl></body></html>";
    let document = html::parse(markup).unwrap();

    // Same check for both tag names, `<dt>` first.
    for (query, tag) in [("count(//dt)", "dt"), ("count(//dd)", "dd")] {
        let items = xpath::parse(query).unwrap().apply(&document).unwrap();
        if let xpath::grammar::data_model::XpathItem::AnyAtomicType(AnyAtomicType::Integer(n)) =
            &items[0]
        {
            assert_eq!(*n, 2, "There should be 2 <{tag}> elements");
        } else {
            panic!("Expected integer count, got: {:?}", &items[0]);
        }
    }
}
/// An `<li>` opened inside a `<div>` inside another `<li>` must parse.
#[test]
fn li_nested_in_special_element_does_not_panic() {
    let parsed = html::parse("<html><body><ol><li><div><li>nested</div></ol></body></html>");
    assert!(
        parsed.is_ok(),
        "li nested in special element should not panic: {:?}",
        parsed.err()
    );
}
/// A `<p>` inside a form and one after it must both be present.
#[test]
fn form_end_tag_processing() {
    let markup =
        "<html><body><form><input><p>inside form</p></form><p>after form</p></body></html>";
    let document = html::parse(markup).unwrap();
    let items = xpath::parse("count(//p)")
        .unwrap()
        .apply(&document)
        .unwrap();
    if let xpath::grammar::data_model::XpathItem::AnyAtomicType(AnyAtomicType::Integer(n)) =
        &items[0]
    {
        assert_eq!(*n, 2, "There should be 2 <p> elements");
    } else {
        panic!("Expected integer count, got: {:?}", &items[0]);
    }
}
/// A simple closed `<form>` must appear exactly once in the tree.
#[test]
fn form_end_tag_without_template() {
    let document = html::parse("<html><body><form>content</form></body></html>").unwrap();
    let forms = xpath::parse("//form").unwrap().apply(&document).unwrap();
    assert_eq!(forms.len(), 1, "Form element should be in the tree");
}
/// Closing `<b>` before its nested `<i>`/`<u>` must parse successfully and
/// keep the text under `<body>`.
#[test]
fn adoption_agency_no_underflow_on_formatting_at_stack_bottom() {
    let markup = "<html><body><b><i><u>text</b></u></i></body></html>";
    let parsed = html::parse(markup);
    assert!(
        parsed.is_ok(),
        "Adoption agency should not underflow: {:?}",
        parsed.err()
    );
    let document = parsed.unwrap();

    let text_nodes = xpath::parse("//body//text()")
        .unwrap()
        .apply(&document)
        .unwrap();
    let text_content: String = text_nodes
        .iter()
        .filter_map(|item| item.extract_as_node().text(&document))
        .collect();
    assert!(
        text_content.contains("text"),
        "Text should be preserved: {text_content}"
    );
}
/// A `<select>` inside a table cell must parse and expose its single
/// `<option>`.
#[test]
fn reset_insertion_mode_select_in_table() {
    let markup = "<html><body><table><tr><td><select><option>A</option></select></td></tr></table></body></html>";
    let parsed = html::parse(markup);
    assert!(
        parsed.is_ok(),
        "Nested select-in-table should parse without error: {:?}",
        parsed.err()
    );
    let document = parsed.unwrap();
    let options = xpath::parse("//option").unwrap().apply(&document).unwrap();
    assert_eq!(options.len(), 1, "Should find the option element");
}
/// A `<select>` nested under 200 `<div>`s must parse and expose its single
/// `<option>`.
#[test]
fn reset_insertion_mode_deeply_nested_elements() {
    let markup = [
        "<html><body>",
        &"<div>".repeat(200),
        "<select><option>deep</option></select>",
        &"</div>".repeat(200),
        "</body></html>",
    ]
    .concat();

    let parsed = html::parse(&markup);
    assert!(
        parsed.is_ok(),
        "Deeply nested elements should not stack overflow: {:?}",
        parsed.err()
    );
    let document = parsed.unwrap();
    let options = xpath::parse("//option").unwrap().apply(&document).unwrap();
    assert_eq!(options.len(), 1, "Should find the deeply nested option");
}
/// A table with caption, colgroup, thead, tbody and tfoot must parse and
/// expose three cells in total.
#[test]
fn reset_insertion_mode_various_table_elements() {
    let markup = concat!(
        "<html><body>",
        "<table><caption>cap</caption>",
        "<colgroup><col></colgroup>",
        "<thead><tr><th>H</th></tr></thead>",
        "<tbody><tr><td>D</td></tr></tbody>",
        "<tfoot><tr><td>F</td></tr></tfoot>",
        "</table></body></html>",
    );
    let parsed = html::parse(markup);
    assert!(
        parsed.is_ok(),
        "Table with multiple section elements should parse: {:?}",
        parsed.err()
    );
    let document = parsed.unwrap();
    let cells = xpath::parse("//td | //th")
        .unwrap()
        .apply(&document)
        .unwrap();
    assert_eq!(cells.len(), 3, "Should find th, td in tbody, td in tfoot");
}
/// A named character reference without a trailing semicolon inside an
/// attribute value must not leave a doubled `&` in the parsed attribute.
#[test]
fn named_charref_without_semicolon_no_double_ampersand() {
    // The href must contain the raw `&not` sequence: with a pre-decoded `¬`
    // there is no ampersand in the input and the check can never fail.
    let text = r#"<html><body><a href="?a=1&notit">link</a></body></html>"#;
    let document = html::parse(text).unwrap();
    let xp = xpath::parse("//a/@href").unwrap();
    let result = xp.apply(&document).unwrap();
    assert_eq!(result.len(), 1);
    let attr_value = result[0]
        .extract_as_node()
        .as_attribute_node()
        .unwrap()
        .value
        .clone();
    assert!(
        !attr_value.contains("&&"),
        "Attribute should not have double ampersand, got: {:?}",
        attr_value
    );
}
/// A document containing `<math><mi>` must parse successfully.
#[test]
fn mathml_elements_are_special() {
    let parsed = html::parse("<html><body><math><mi>x</mi></math></body></html>");
    assert!(
        parsed.is_ok(),
        "HTML with MathML elements should parse: {:?}",
        parsed.err()
    );
}
/// A plain `<p>` in the body must be found exactly once by `//p`.
#[test]
fn scope_checking_is_namespace_aware() {
    let document = html::parse("<html><body><p>text</p></body></html>").unwrap();
    let paragraphs = xpath::parse("//p").unwrap().apply(&document).unwrap();
    assert_eq!(
        paragraphs.len(),
        1,
        "The <p> element should be findable (not blocked by scope issues)"
    );
}
/// Six nested `<b>` elements must parse and leave at least one `<b>` node.
#[test]
fn adoption_agency_inner_loop_removes_from_open_elements() {
    let markup = "<html><body><b><b><b><b><b><b>text</b></b></b></b></b></b></body></html>";
    let parsed = html::parse(markup);
    assert!(
        parsed.is_ok(),
        "Deeply nested formatting elements should parse without error: {:?}",
        parsed.err()
    );
    let document = parsed.unwrap();
    let bolds = xpath::parse("//b").unwrap().apply(&document).unwrap();
    assert!(
        !bolds.is_empty(),
        "Should find at least one <b> element after adoption agency"
    );
}
/// A second `<body>` start tag carrying a `class` attribute must leave exactly
/// one `class` attribute reachable via `//body/@class`.
#[test]
fn duplicate_body_merges_attributes() {
    let markup = r#"<html><body><body class="extra">content</body></body></html>"#;
    let parsed = html::parse(markup);
    assert!(
        parsed.is_ok(),
        "Duplicate <body> with attributes should parse: {:?}",
        parsed.err()
    );
    let document = parsed.unwrap();
    let class_attrs = xpath::parse("//body/@class")
        .unwrap()
        .apply(&document)
        .unwrap();
    assert_eq!(
        class_attrs.len(),
        1,
        "body should have the 'class' attribute merged from the second body tag"
    );
}
/// A tree built with `DocumentBuilder` (html > body > div > "hello") must
/// expose one `<div>` whose string value is "hello".
#[test]
fn document_builder_nested_elements_correct_tree() {
    use skyscraper::html::grammar::document_builder::DocumentBuilder;

    let built = DocumentBuilder::new()
        .add_element("html", |html_el| {
            html_el.add_element("body", |body_el| {
                body_el.add_element("div", |div_el| div_el.add_text("hello"))
            })
        })
        .build()
        .unwrap();

    let divs = xpath::parse("//div").unwrap().apply(&built).unwrap();
    assert_eq!(divs.len(), 1, "Should find exactly one div");

    let div_string = xpath::parse("string(//div)")
        .unwrap()
        .apply(&built)
        .unwrap();
    let div_text = div_string[0].extract_as_any_atomic_type().to_string();
    assert_eq!(div_text, "hello", "div text content should be 'hello'");
}
/// Serializing a comment whose value contains `-->` must emit `-->` exactly
/// once.
#[test]
fn comment_serialization_sanitizes_double_dash() {
    use indextree::Arena;
    use skyscraper::html::{DocumentFormatType, DocumentNode, HtmlComment, HtmlDocument, HtmlNode};

    let mut arena = Arena::new();
    let comment_id = arena.new_node(HtmlNode::Comment(HtmlComment {
        value: " evil --> payload ".to_string(),
    }));
    let doc = HtmlDocument::new(arena, DocumentNode::new(comment_id));

    let output = doc.to_formatted_string(DocumentFormatType::Standard);
    let closer_count = output.matches("-->").count();
    assert_eq!(
        closer_count, 1,
        "Comment with '-->' in value should be sanitized to have exactly one '-->' (the closer), got: {}",
        output
    );
}
/// A numeric character reference that does not name a valid Unicode scalar
/// value must decode to U+FFFD.
#[test]
fn unescape_failed_numeric_ref_produces_replacement_char() {
    // 0x110000 is one past the last Unicode code point. A literal U+FFFD as
    // input would make the assertion pass even if nothing were decoded.
    // NOTE(review): the exact out-of-range reference originally intended here
    // is an assumption — confirm.
    let input = "&#x110000;";
    let result = html::unescape_characters(input);
    assert_eq!(
        result, "\u{FFFD}",
        "Failed numeric character reference should produce U+FFFD"
    );
}
/// A valid decimal character reference must still decode to its character.
#[test]
fn unescape_valid_numeric_ref_still_works() {
    // Escaped form required: a literal "A" would pass without any decoding.
    let input = "&#65;";
    let result = html::unescape_characters(input);
    assert_eq!(
        result, "A",
        "Valid numeric reference &#65; should produce 'A'"
    );
}
/// When both attribute names appear in the formatted output, `alpha` must come
/// before `zebra`.
#[test]
fn display_node_sorts_attributes() {
    use indextree::Arena;
    use skyscraper::html::{DocumentFormatType, DocumentNode, HtmlDocument, HtmlNode, HtmlTag};
    use std::collections::HashMap;

    let mut attributes = HashMap::new();
    attributes.insert("zebra".to_string(), "z".to_string());
    attributes.insert("alpha".to_string(), "a".to_string());

    // Build a one-element tree by hand: <div zebra alpha> with a text child.
    let mut arena = Arena::new();
    let div_id = arena.new_node(HtmlNode::Tag(HtmlTag {
        name: "div".to_string(),
        attributes,
    }));
    let text_id = arena.new_node(HtmlNode::Text(skyscraper::html::HtmlText::new("x")));
    div_id.append(text_id, &mut arena);

    let doc = HtmlDocument::new(arena, DocumentNode::new(div_id));
    let rendered = doc.to_formatted_string(DocumentFormatType::Standard);

    // The ordering is only checked when both names were actually emitted.
    match (rendered.find("alpha"), rendered.find("zebra")) {
        (Some(alpha_at), Some(zebra_at)) => assert!(
            alpha_at < zebra_at,
            "alpha should appear before zebra in sorted output, got: {}",
            rendered
        ),
        _ => {}
    }
}
/// Comments before and after a `<div>` must not change the number of `<div>`s
/// found.
#[test]
fn markup_declaration_comments_parse_correctly() {
    let markup = r#"<html><body><!-- comment 1 --><!-- comment 2 --><div>text</div><!-- comment 3 --></body></html>"#;
    let document = html::parse(markup).unwrap();
    let divs = xpath::parse("//div").unwrap().apply(&document).unwrap();
    assert_eq!(divs.len(), 1, "comments should not affect div parsing");
}
/// A plain `<!DOCTYPE html>` prefix must leave the document body parseable:
/// exactly one `<div>` is expected.
#[test]
fn markup_declaration_doctype_parses_correctly() {
    let markup = r#"<!DOCTYPE html><html><body><div>text</div></body></html>"#;
    let parsed = html::parse(markup).unwrap();

    let query = xpath::parse("//div").unwrap();
    let divs = query.apply(&parsed).unwrap();

    assert_eq!(divs.len(), 1, "DOCTYPE should parse correctly");
}
/// A DOCTYPE carrying both a PUBLIC and a SYSTEM identifier must parse, and
/// the single `<div>` that follows must be reachable via XPath.
#[test]
fn markup_declaration_doctype_public_system() {
    const XHTML_STRICT_DOC: &str = r#"<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html><body><div>ok</div></body></html>"#;

    let tree = html::parse(XHTML_STRICT_DOC).unwrap();
    let selector = xpath::parse("//div").unwrap();
    let selected = selector.apply(&tree).unwrap();

    let found = selected.len();
    assert_eq!(found, 1, "DOCTYPE with PUBLIC/SYSTEM should parse");
}
/// An end tag with no matching open element must be tolerated by the parser.
#[test]
fn end_tag_for_missing_element_does_not_panic() {
    let outcome = html::parse(r#"<html><body></nonexistent><div>ok</div></body></html>"#);
    assert!(
        outcome.is_ok(),
        "end tag for missing element should not panic"
    );
}
/// Unclosed list/definition items followed by stray end tags must still
/// yield an `Ok` parse result.
#[test]
fn extremely_malformed_html_does_not_panic() {
    const BROKEN_MARKUP: &str = r#"<html><body><li><dd><dt><li></p></div></span></body></html>"#;

    let parse_ok = html::parse(BROKEN_MARKUP).is_ok();
    assert!(parse_ok, "malformed HTML should not panic");
}
/// Parsing the empty string must succeed.
#[test]
fn empty_document_does_not_panic() {
    let outcome = html::parse("");
    assert!(outcome.is_ok(), "empty document should not panic");
}
/// Markup-looking text inside a `<script>` body must stay script data: only
/// the `<div>` outside the script may show up as an element.
#[test]
fn script_tag_content_not_parsed_as_html() {
    const PAGE: &str = r#"<html><body><script>var x = "<div>not a real div</div>";</script><div>real</div></body></html>"#;

    let document = html::parse(PAGE).unwrap();
    let div_query = xpath::parse("//div").unwrap();
    let divs = div_query.apply(&document).unwrap();

    let div_count = divs.len();
    assert_eq!(
        div_count, 1,
        "script content should not be parsed as HTML elements"
    );
}
/// Parses 1000 nested `<div>` elements wrapping a single text node and checks
/// that the text can still be located with `//text()`.
#[test]
fn deeply_nested_divs_do_not_stack_overflow() {
    use skyscraper::xpath::grammar::XpathItemTreeNode;

    const DEPTH: usize = 1000;

    // DEPTH opening tags, the payload text, then DEPTH closing tags.
    let markup = format!(
        "<html><body>{}deep text{}</body></html>",
        "<div>".repeat(DEPTH),
        "</div>".repeat(DEPTH)
    );

    let document = html::parse(&markup).unwrap();
    let text_query = xpath::parse("//text()").unwrap();
    let items = text_query.apply(&document).unwrap();

    // Keep only the text nodes and copy out their content.
    let contents: Vec<String> = items
        .iter()
        .filter_map(|item| match item.extract_as_node() {
            XpathItemTreeNode::TextNode(text) => Some(text.content.clone()),
            _ => None,
        })
        .collect();

    let found = contents.iter().any(|content| content.contains("deep text"));
    assert!(found, "deeply nested content should be accessible");
}
/// A document with a `<title>` inside `<svg>` and another after it must
/// yield at least one match for `//title`.
#[test]
fn svg_title_parses_correctly_in_svg_context() {
    const PAGE: &str = r#"<html><body><svg><title>SVG Title</title><rect/></svg><title>HTML Title</title></body></html>"#;

    let document = html::parse(PAGE).unwrap();
    let title_query = xpath::parse("//title").unwrap();
    let titles = title_query.apply(&document).unwrap();

    let any_title = !titles.is_empty();
    assert!(
        any_title,
        "title elements should be found in both HTML and SVG contexts"
    );
}
/// An `<svg>` containing a `<title>` inside a `<p>` must remain a child of
/// that `<p>`, i.e. `//p/svg` has to match.
#[test]
fn scope_check_distinguishes_svg_title_from_html_title() {
    let markup = r#"<html><body><p>text<svg><title>SVG</title></svg></p></body></html>"#;
    let tree = html::parse(markup).unwrap();

    let svg_under_p = xpath::parse("//p/svg").unwrap();
    let hits = svg_under_p.apply(&tree).unwrap();

    let svg_is_child = !hits.is_empty();
    assert!(
        svg_is_child,
        "SVG element should be a child of <p>, not a sibling — SVG <title> should not close <p>"
    );
}
/// An ordinary HTML `<title>` in `<head>` must be found exactly once by
/// `//head/title`.
#[test]
fn scope_check_html_title_still_works() {
    const PAGE: &str = r#"<html><head><title>Test</title></head><body></body></html>"#;

    let document = html::parse(PAGE).unwrap();
    let head_title = xpath::parse("//head/title").unwrap();
    let titles = head_title.apply(&document).unwrap();

    let title_count = titles.len();
    assert_eq!(title_count, 1, "HTML <title> should be under <head>");
}
/// Two same-named nested `<span>` elements must both be kept in the tree,
/// with the inner one as a child of the outer one.
#[test]
fn any_other_end_tag_identity_comparison() {
    let markup = r#"<html><body><span><span>inner</span></span></body></html>"#;
    let document = html::parse(markup).unwrap();

    // (XPath expression, expected match count, failure message), checked in order.
    let expectations: [(&str, usize, &str); 2] = [
        ("//span", 2, "both nested spans should be in the tree"),
        ("//span/span", 1, "inner span should be child of outer span"),
    ];

    for (expression, expected, message) in expectations {
        let query = xpath::parse(expression).unwrap();
        let matches = query.apply(&document).unwrap();
        assert_eq!(matches.len(), expected, "{message}");
    }
}
/// Character data placed directly inside `<body>` must produce at least one
/// text node under it.
#[test]
fn character_insertion_produces_correct_text() {
    const PAGE: &str = r#"<html><body>Hello & world</body></html>"#;

    let document = html::parse(PAGE).unwrap();
    let body_text = xpath::parse("//body/text()").unwrap();
    let text_nodes = body_text.apply(&document).unwrap();

    let has_text = !text_nodes.is_empty();
    assert!(has_text, "body should have text content");
}
/// An `Xpath` stored in a `LazyLock` static must be usable like a locally
/// parsed expression.
#[test]
fn lazy_lock_xpath_static_still_works() {
    use skyscraper::xpath::Xpath;
    use std::sync::LazyLock;

    // Parsed on first access only.
    static DIV_QUERY: LazyLock<Xpath> = LazyLock::new(|| xpath::parse("//div").unwrap());

    let document = html::parse("<html><body><div>test</div></body></html>").unwrap();
    let hits = DIV_QUERY.apply(&document).unwrap();

    assert_eq!(hits.len(), 1, "LazyLock-based static XPath should work");
}