use std::{
collections::HashMap,
io::Write,
path::PathBuf,
process::{Command, Stdio},
};
use itertools::Itertools;
use serde::Deserialize;
use skyscraper::{
html,
xpath::{self, xpath_item_set::XpathItemSet, XpathItemTree},
};
/// Flattened description of one matched element, used as the common shape for
/// comparing results from both engines.
///
/// Values are either deserialized from the JSON printed by the lxml helper
/// script (see `get_lxml_elements`) or built from Skyscraper results by
/// `skyscraper_to_lxml_elements`.
#[derive(Deserialize, Debug, PartialEq)]
struct LxmlElement {
/// Element tag name.
pub tag: String,
/// The element's own text, if any (presumably lxml's `.text` — confirm against xpath.py).
pub text: Option<String>,
/// Text content of the element. NOTE(review): populated on both sides but not
/// checked by `compare_skyscraper_to_lxml`.
pub text_content: String,
/// Attribute name → value map.
pub attrib: HashMap<String, String>,
/// Text fragments in the order the engine's `itertext` yields them.
pub itertext: Vec<String>,
}
fn get_lxml_output(xpath: &str, html_text: String, count_only: bool) -> std::process::Output {
let mut lxml_python_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
lxml_python_path.push("tests/lxml_tests/xpath.py");
let mut venv_python = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
venv_python.push("tests/lxml_tests/.venv/bin/python");
let python = if venv_python.exists() {
venv_python.into_os_string().into_string().unwrap()
} else {
"python3".to_string()
};
let mut cmd = Command::new(python);
cmd.stdin(Stdio::piped())
.stdout(Stdio::piped())
.arg(
lxml_python_path
.clone()
.into_os_string()
.into_string()
.unwrap(),
)
.arg(xpath);
if count_only {
cmd.arg("--count-only");
}
let mut process = cmd.spawn().expect("failed to spawn process");
let mut stdin = process.stdin.take().expect("Failed to open stdin");
std::thread::spawn(move || {
stdin
.write_all(html_text.as_bytes())
.expect("Failed to write to stdin");
});
let output = process
.wait_with_output()
.expect("failed to execute stack overflow tests");
output
}
/// Runs the lxml script for `xpath` over `html_text` and deserializes the JSON
/// it prints on stdout into the list of matched elements.
///
/// # Panics
///
/// Panics if the script exits unsuccessfully (the message contains its stdout
/// and stderr) or if its stdout is not valid JSON for `Vec<LxmlElement>`.
fn get_lxml_elements(xpath: &str, html_text: String) -> Vec<LxmlElement> {
    let output = get_lxml_output(xpath, html_text, false);
    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(
        output.status.success(),
        "{}\n{}",
        stdout,
        String::from_utf8_lossy(&output.stderr)
    );
    // Name the expression and show the raw output so a malformed response can
    // be diagnosed from the test log alone.
    serde_json::from_str(&stdout).unwrap_or_else(|err| {
        panic!(
            "failed to parse lxml output for xpath '{}': {}\n{}",
            xpath, err, stdout
        )
    })
}
/// Converts every item of a Skyscraper result set into the `LxmlElement` shape
/// so it can be compared with the lxml script's output.
///
/// Each item is extracted as a node and then as an element node; the element's
/// name, text, text content, attributes and itertext are read from
/// `xpath_tree`.
///
/// NOTE(review): the `extract_*` calls presumably panic when an item is not an
/// element node — confirm against the Skyscraper API before using this with
/// expressions that select text or attribute nodes.
fn skyscraper_to_lxml_elements(
    xpath_tree: &XpathItemTree,
    item_set: XpathItemSet,
) -> Vec<LxmlElement> {
    item_set
        .into_iter()
        .map(|item| {
            let node = item.extract_into_node();
            let element = node.extract_as_element_node();
            LxmlElement {
                tag: element.name.to_string(),
                text: element.text(xpath_tree),
                text_content: element.text_content(xpath_tree),
                attrib: element
                    .attributes(xpath_tree)
                    .iter()
                    .map(|attr| (attr.name.clone(), attr.value.clone()))
                    .collect(),
                itertext: element.itertext(xpath_tree).collect(),
            }
        })
        .collect()
}
/// Asserts that the lxml results and the converted Skyscraper results describe
/// the same elements in the same order.
///
/// For each position the tag, text, attributes (names compared
/// case-insensitively, values exactly) and itertext must match. If one list is
/// longer than the other, the first unmatched position fails with an
/// "Element mismatch" message.
fn compare_skyscraper_to_lxml(
    lxml_elements: Vec<LxmlElement>,
    converted_skyscraper_elems: Vec<LxmlElement>,
) {
    // Copies an attribute map with ASCII-lowercased names and unchanged values.
    let lowercase_keys = |attrs: &HashMap<String, String>| -> HashMap<String, String> {
        attrs
            .iter()
            .map(|(name, value)| (name.to_ascii_lowercase(), value.clone()))
            .collect()
    };

    let pairs = lxml_elements
        .iter()
        .zip_longest(converted_skyscraper_elems.iter());
    for (index, pair) in pairs.enumerate() {
        match pair.left_and_right() {
            (Some(expected), Some(actual)) => {
                assert_eq!(expected.tag, actual.tag, "Tag mismatch at index {}", index);
                assert_eq!(
                    expected.text, actual.text,
                    "Text mismatch at index {}",
                    index
                );
                assert_eq!(
                    lowercase_keys(&expected.attrib),
                    lowercase_keys(&actual.attrib),
                    "Attribute mismatch at index {}",
                    index
                );
                compare_itertext(&expected.itertext, &actual.itertext);
            }
            // One side has run out of elements, so this compares `Some` with
            // `None` and reports the first surplus element.
            (expected, actual) => {
                assert_eq!(expected, actual, "Element mismatch at index {}", index);
            }
        }
    }
    assert_eq!(converted_skyscraper_elems.len(), lxml_elements.len());
}
/// Asserts that two itertext sequences are identical, position by position.
///
/// Positions present in only one sequence are compared against `None`, so a
/// length difference fails at the first surplus index.
///
/// # Panics
///
/// Panics with "Itertext mismatch at index N" at the first differing position.
fn compare_itertext(first: &[String], second: &[String]) {
    let longest = first.len().max(second.len());
    for i in 0..longest {
        assert_eq!(
            first.get(i),
            second.get(i),
            "Itertext mismatch at index {}",
            i
        );
    }
}
/// HTML sample embedded at compile time from
/// `../samples/James-LG_Skyscraper.html`; the default document for the
/// comparison tests in this file.
static GITHUB_HTML: &str = include_str!("../samples/James-LG_Skyscraper.html");
/// Sanity check of the lxml side alone: `//a[@rel='author']` on the sample
/// page must yield exactly one `a` element with the expected text and `rel`.
#[test]
fn test_lxml_output() {
    let matches = get_lxml_elements(r#"//a[@rel='author']"#, GITHUB_HTML.to_string());
    assert_eq!(matches.len(), 1);

    let author_link = &matches[0];
    assert_eq!(author_link.tag, "a");
    assert_eq!(author_link.text, Some("James-LG".to_string()));
    assert_eq!(author_link.attrib["rel"], "author");
}
/// Compares Skyscraper and lxml element by element for
/// `//div[@role='tabpanel']` on the `GITHUB_HTML` sample.
#[test]
fn test_text_handling() {
    // Same parse / evaluate / compare sequence as the other comparison tests,
    // so defer to the shared helper instead of repeating it here.
    run_lxml_comparison(r#"//div[@role='tabpanel']"#);
}
/// Compares Skyscraper and lxml element by element for `//h2` on the
/// `GITHUB_HTML` sample.
#[test]
fn test_text_handling2() {
    // Same parse / evaluate / compare sequence as the other comparison tests,
    // so defer to the shared helper instead of repeating it here.
    run_lxml_comparison(r#"//h2"#);
}
/// Compares Skyscraper and lxml element by element for `//div` on the
/// `GITHUB_HTML` sample.
#[test]
fn test_text_handling3() {
    // Same parse / evaluate / compare sequence as the other comparison tests,
    // so defer to the shared helper instead of repeating it here.
    run_lxml_comparison(r#"//div"#);
}
/// Checks that both engines report the same number of matches for an exact
/// multi-class `@class` comparison on the `GITHUB_HTML` sample.
///
/// Unlike `run_lxml_count_comparison`, this does not require the count to be
/// non-zero.
#[test]
fn test_item_count1() {
    let html_text = GITHUB_HTML.to_string();
    let xpath = "//div[@class='flex-auto min-width-0 width-fit mr-3']";
    let html_document = html::parse(&html_text).unwrap();
    let xpath_expr = xpath::parse(xpath).unwrap();

    let lxml_output = get_lxml_output(xpath, html_text, true);
    let output = String::from_utf8_lossy(&lxml_output.stdout);
    // Fail on the script's own error rather than on an opaque integer-parse
    // panic caused by empty stdout.
    assert!(
        lxml_output.status.success(),
        "lxml script failed ({}) for '{}'\n{}\n{}",
        lxml_output.status,
        xpath,
        output,
        String::from_utf8_lossy(&lxml_output.stderr)
    );

    let skyscraper_elements = xpath_expr.apply(&html_document).unwrap();
    let lxml_count = output.trim().parse::<usize>().unwrap_or_else(|err| {
        panic!(
            "lxml count output for '{}' is not a number ({}): {:?}",
            xpath, err, output
        )
    });
    assert_eq!(lxml_count, skyscraper_elements.len());
}
/// Runs `xpath` against the embedded `GITHUB_HTML` sample and checks that
/// Skyscraper and lxml return matching elements.
fn run_lxml_comparison(xpath: &str) {
run_lxml_comparison_with_html(xpath, GITHUB_HTML);
}
/// Evaluates `xpath` over `html_text` with both lxml and Skyscraper and asserts
/// that the two result lists match element by element.
///
/// Panics if the document or expression fails to parse, if either engine
/// fails, or if any compared field differs.
fn run_lxml_comparison_with_html(xpath: &str, html_text: &str) {
    let document = html::parse(html_text).unwrap();
    let expression = xpath::parse(xpath).unwrap();

    // lxml's answer is the reference result.
    let expected = get_lxml_elements(xpath, html_text.to_string());

    let matched = expression.apply(&document).unwrap();
    let actual = skyscraper_to_lxml_elements(&document, matched);

    compare_skyscraper_to_lxml(expected, actual);
}
/// Runs `xpath` against the embedded `GITHUB_HTML` sample and checks that
/// Skyscraper and lxml agree on the number of matches.
fn run_lxml_count_comparison(xpath: &str) {
run_lxml_count_comparison_with_html(xpath, GITHUB_HTML);
}
/// Evaluates `xpath` over `html_text` with both engines and asserts that they
/// return the same number of items.
///
/// The lxml script is run with `--count-only`, so only the match count is
/// compared, not element details. The lxml count must be greater than zero so
/// that an expression matching nothing cannot pass vacuously.
///
/// # Panics
///
/// Panics if parsing or evaluation fails, if the lxml script exits
/// unsuccessfully or prints something other than a number, if lxml reports
/// zero matches, or if the counts differ.
fn run_lxml_count_comparison_with_html(xpath: &str, html_text: &str) {
    let html_document = html::parse(html_text).unwrap();
    let xpath_expr = xpath::parse(xpath).unwrap();

    let lxml_output = get_lxml_output(xpath, html_text.to_string(), true);
    let output = String::from_utf8_lossy(&lxml_output.stdout);
    // Fail on the script's own error rather than on an opaque integer-parse
    // panic caused by empty stdout.
    assert!(
        lxml_output.status.success(),
        "lxml script failed ({}) for '{}'\n{}\n{}",
        lxml_output.status,
        xpath,
        output,
        String::from_utf8_lossy(&lxml_output.stderr)
    );

    let skyscraper_elements = xpath_expr.apply(&html_document).unwrap();
    let lxml_count = output.trim().parse::<usize>().unwrap_or_else(|err| {
        panic!(
            "lxml count output for '{}' is not a number ({}): {:?}",
            xpath, err, output
        )
    });
    assert!(lxml_count > 0, "lxml returned 0 results for '{}'", xpath);
    assert_eq!(
        lxml_count,
        skyscraper_elements.len(),
        "match count differs for '{}'",
        xpath
    );
}
// Element selection by tag name (`//name`). Each test compares the full
// element details (tag, text, attributes, itertext) with lxml on the
// `GITHUB_HTML` sample.
#[test]
fn test_select_all_spans() {
run_lxml_comparison("//span");
}
#[test]
fn test_select_all_anchors() {
run_lxml_comparison("//a");
}
#[test]
fn test_select_all_li() {
run_lxml_comparison("//li");
}
#[test]
fn test_select_all_p() {
run_lxml_comparison("//p");
}
#[test]
fn test_select_all_h1() {
run_lxml_comparison("//h1");
}
#[test]
fn test_select_all_buttons() {
run_lxml_comparison("//button");
}
#[test]
fn test_select_all_summary() {
run_lxml_comparison("//summary");
}
#[test]
fn test_select_all_img() {
run_lxml_comparison("//img");
}
#[test]
fn test_select_all_meta() {
run_lxml_comparison("//meta");
}
#[test]
fn test_select_all_link() {
run_lxml_comparison("//link");
}
#[test]
fn test_select_all_input() {
run_lxml_comparison("//input");
}
#[test]
fn test_select_all_form() {
run_lxml_comparison("//form");
}
#[test]
fn test_select_all_svg() {
run_lxml_comparison("//svg");
}
// Multi-step paths on the `GITHUB_HTML` sample: child steps (`/`), descendant
// steps (`//`) and the parent abbreviation (`..`).
#[test]
fn test_child_li_a() {
run_lxml_comparison("//li/a");
}
#[test]
fn test_child_ul_li() {
run_lxml_comparison("//ul/li");
}
#[test]
fn test_child_div_span() {
run_lxml_comparison("//div/span");
}
#[test]
fn test_child_of_specific_div() {
run_lxml_comparison("//div[@class='position-relative']/div");
}
#[test]
fn test_descendant_nav_a() {
run_lxml_comparison("//nav//a");
}
#[test]
fn test_descendant_header_nav_a() {
run_lxml_comparison("//header//nav//a");
}
#[test]
fn test_descendant_details_summary() {
run_lxml_comparison("//details//summary");
}
#[test]
fn test_descendant_specific_div_span() {
run_lxml_comparison("//div[@class='position-relative']//span");
}
// `..` selects the parent of every anchor.
#[test]
fn test_parent_axis() {
run_lxml_comparison("//a/..");
}
// Attribute predicates on the `GITHUB_HTML` sample: existence (`[@x]`), exact
// value, `contains()`, and combinations with `and` / `not()`.
#[test]
fn test_attr_contains_href() {
run_lxml_comparison("//a[contains(@href, 'github')]");
}
#[test]
fn test_attr_contains_class() {
run_lxml_comparison("//a[contains(@class, 'Link')]");
}
#[test]
fn test_attr_has_class() {
run_lxml_comparison("//span[@class]");
}
#[test]
fn test_attr_has_id() {
run_lxml_comparison("//div[@id]");
}
#[test]
fn test_attr_multiple_existence() {
run_lxml_comparison("//a[@class and @href]");
}
#[test]
fn test_attr_exact_class() {
run_lxml_comparison("//div[@class='position-relative']");
}
#[test]
fn test_attr_data_content() {
run_lxml_comparison("//span[@data-content]");
}
#[test]
fn test_attr_aria_label() {
run_lxml_comparison("//a[@aria-label]");
}
#[test]
fn test_attr_data_analytics() {
run_lxml_comparison("//a[@data-analytics-event]");
}
#[test]
fn test_attr_img_alt() {
run_lxml_comparison("//img[@alt]");
}
#[test]
fn test_attr_img_src() {
run_lxml_comparison("//img[@src]");
}
#[test]
fn test_not_attr() {
run_lxml_comparison("//a[not(@class)]");
}
#[test]
fn test_attr_and_not() {
run_lxml_comparison("//a[@href and not(@class)]");
}
// Only the match count is compared here, not element details.
#[test]
fn test_wildcard_with_attr() {
run_lxml_count_comparison("//*[@role]");
}
// Positional predicates, function predicates, child-existence predicates,
// unions and `or` on the `GITHUB_HTML` sample. Tests calling
// `run_lxml_count_comparison` compare only the number of matches.
#[test]
fn test_positional_first() {
run_lxml_comparison("(//a)[1]");
}
#[test]
fn test_positional_last() {
run_lxml_comparison("(//span)[last()]");
}
#[test]
fn test_positional_lte() {
run_lxml_comparison("//li[position() <= 3]");
}
#[test]
fn test_string_length_predicate() {
run_lxml_comparison("//span[string-length(@class) > 20]");
}
#[test]
fn test_element_exists_descendant() {
run_lxml_count_comparison("//div[.//svg]");
}
#[test]
fn test_element_exists_child() {
run_lxml_comparison("//div[p]");
}
#[test]
fn test_element_exists_nested() {
run_lxml_comparison("//ul[li/a]");
}
#[test]
fn test_count_predicate() {
run_lxml_count_comparison("//div[count(a) > 0]");
}
#[test]
fn test_union_a_or_span() {
run_lxml_count_comparison("//a | //span");
}
#[test]
fn test_or_predicate() {
run_lxml_comparison("//a[contains(@class, 'Link') or contains(@class, 'btn')]");
}
#[test]
fn test_or_predicate_child_element() {
run_lxml_comparison("//li[a or span]");
}
#[test]
fn test_not_equal_attr() {
run_lxml_count_comparison("//div[@class != 'position-relative']");
}
// Explicit axes on the `GITHUB_HTML` sample: ancestor, ancestor-or-self, self,
// following-sibling, preceding-sibling, following and preceding.
#[test]
fn test_ancestor_axis() {
run_lxml_comparison("//a[@rel='author']/ancestor::div");
}
#[test]
fn test_ancestor_axis_from_class() {
run_lxml_comparison("//div[@class='position-relative']/ancestor::div");
}
#[test]
fn test_ancestor_axis_positional() {
run_lxml_comparison("//a[@rel='author']/ancestor::div[1]");
}
#[test]
fn test_ancestor_axis_last() {
run_lxml_comparison("//a[@rel='author']/ancestor::div[last()]");
}
#[test]
fn test_ancestor_wildcard() {
run_lxml_comparison("//a[@rel='author']/ancestor::*[1]");
}
#[test]
fn test_ancestor_or_self_axis() {
run_lxml_comparison("//div[@class='position-relative']/ancestor-or-self::div");
}
#[test]
fn test_self_axis() {
run_lxml_comparison("//a[@rel='author']/self::a");
}
#[test]
fn test_following_sibling() {
run_lxml_comparison("//li/following-sibling::li");
}
// The sibling axis is used inside a predicate here; only counts are compared.
#[test]
fn test_following_sibling_predicate() {
run_lxml_count_comparison("//div[@class][following-sibling::div]");
}
#[test]
fn test_preceding_sibling() {
run_lxml_comparison("//li/preceding-sibling::li");
}
// The sibling axis is used inside a predicate here; only counts are compared.
#[test]
fn test_preceding_sibling_predicate() {
run_lxml_count_comparison("//div[@class][preceding-sibling::div]");
}
#[test]
fn test_following_axis() {
run_lxml_comparison("//h1/following::h2");
}
#[test]
fn test_preceding_axis() {
run_lxml_comparison("//h2/preceding::h1");
}
// Chained (`[a][b]`) and nested (`[a[b]]`) predicates on the `GITHUB_HTML`
// sample.
#[test]
fn test_chained_predicates() {
run_lxml_count_comparison("//a[@class][contains(@href, 'github')][1]");
}
// NOTE(review): despite the name, the second predicate tests for a `span`
// child rather than a second attribute.
#[test]
fn test_two_attr_predicates() {
run_lxml_comparison("//div[@class][span]");
}
#[test]
fn test_nested_predicate() {
run_lxml_comparison("//div[div[a]]");
}
#[test]
fn test_nested_attr_predicate() {
run_lxml_comparison("//li[a[@href]]");
}
#[test]
fn test_nested_attr_predicate2() {
run_lxml_comparison("//div[a[@class]]");
}
#[test]
fn test_compound_negation() {
run_lxml_count_comparison("//a[@href][not(starts-with(@href, '#'))]");
}
// Positional predicates on the `GITHUB_HTML` sample, applied either to a
// parenthesised node set (`(//div)[...]`) or to a step (`//ul/li[...]`).
#[test]
fn test_positional_range() {
run_lxml_comparison("(//div)[position() >= 3 and position() <= 5]");
}
#[test]
fn test_positional_first_child() {
run_lxml_comparison("//ul/li[1]");
}
#[test]
fn test_positional_last_child() {
run_lxml_comparison("//ul/li[last()]");
}
#[test]
fn test_positional_last_sibling() {
run_lxml_count_comparison("//div[last()]");
}
#[test]
fn test_positional_first_sibling() {
run_lxml_count_comparison("//div[1]");
}
// String functions in predicates on the `GITHUB_HTML` sample:
// `string-length`, `normalize-space` and `contains`, applied to attributes or
// to the context node (`.`).
#[test]
fn test_string_length_href() {
run_lxml_comparison("//a[string-length(@href) > 50]");
}
#[test]
fn test_normalize_space_predicate() {
run_lxml_comparison("//a[normalize-space(@class) = 'Link--secondary']");
}
#[test]
fn test_contains_text_content() {
run_lxml_comparison("//a[contains(., 'James')]");
}
#[test]
fn test_contains_dot_text() {
run_lxml_count_comparison("//div[contains(., 'Skyscraper')]");
}
#[test]
fn test_string_length_normalize_space_dot() {
run_lxml_count_comparison("//a[string-length(normalize-space(.)) > 0]");
}
// Absolute paths from the document root, parent navigation, and `count()`
// predicates on the `GITHUB_HTML` sample.
#[test]
fn test_absolute_path() {
run_lxml_comparison("/html/body//div[@id]");
}
#[test]
fn test_absolute_path_direct_children() {
run_lxml_comparison("/html/body/div");
}
#[test]
fn test_parent_then_child() {
run_lxml_comparison("//a[@href]/../span");
}
#[test]
fn test_count_many_children() {
run_lxml_count_comparison("//div[count(div) > 3]");
}
#[test]
fn test_count_wildcard_children() {
run_lxml_count_comparison("//div[count(*) > 10]");
}
/// Small hand-written document for the `*_with_html` tests.
///
/// Under a single `div#root` it contains: a three-item `ul.list`, a
/// `div.content` with two paragraphs holding inline `strong` / `em` children,
/// a two-row table with an explicit `tbody`, a `div.nested` wrapping
/// `div.inner` and a `span[data-x]`, and a `div.siblings` with three spans.
static CUSTOM_HTML: &str = r#"<html><body>
<div id="root">
<ul class="list">
<li class="item first">Alpha</li>
<li class="item">Beta</li>
<li class="item last">Gamma</li>
</ul>
<div class="content">
<p>Hello <strong>bold</strong> world</p>
<p class="intro">Second <em>emphasized</em> paragraph</p>
</div>
<!-- Explicit tbody so Skyscraper (WHATWG) and lxml produce the same tree. -->
<table><tbody>
<tr><td class="c1">A1</td><td class="c2">A2</td></tr>
<tr><td class="c1">B1</td><td class="c2">B2</td></tr>
</tbody></table>
<div class="nested">
<div class="inner"><span data-x="1">deep</span></div>
</div>
<div class="siblings">
<span class="a">first</span>
<span class="b">second</span>
<span class="c">third</span>
</div>
</div>
</body></html>"#;
// Axis tests against the small `CUSTOM_HTML` document, where the expected
// node sets are easy to verify by reading the fixture.
#[test]
fn test_custom_following_sibling() {
run_lxml_comparison_with_html("//li[@class='item first']/following-sibling::li", CUSTOM_HTML);
}
#[test]
fn test_custom_preceding_sibling() {
run_lxml_comparison_with_html("//li[@class='item last']/preceding-sibling::li", CUSTOM_HTML);
}
#[test]
fn test_custom_ancestor() {
run_lxml_comparison_with_html("//strong/ancestor::div", CUSTOM_HTML);
}
#[test]
fn test_custom_ancestor_or_self() {
run_lxml_comparison_with_html("//div[@class='inner']/ancestor-or-self::div", CUSTOM_HTML);
}
#[test]
fn test_custom_following() {
run_lxml_comparison_with_html("//ul/following::div", CUSTOM_HTML);
}
#[test]
fn test_custom_preceding() {
run_lxml_comparison_with_html("//table/preceding::div", CUSTOM_HTML);
}
#[test]
fn test_custom_self_axis() {
run_lxml_comparison_with_html("//p/self::p", CUSTOM_HTML);
}
// Only the match count is compared for the union.
#[test]
fn test_custom_union() {
run_lxml_count_comparison_with_html("//strong | //em", CUSTOM_HTML);
}
// Path navigation and predicate tests against `CUSTOM_HTML`.
// NOTE(review): despite the name, this expression only steps to the parent of
// `strong`; no sibling step is involved.
#[test]
fn test_custom_parent_then_sibling() {
run_lxml_comparison_with_html("//strong/..", CUSTOM_HTML);
}
#[test]
fn test_custom_child_descendant_mix() {
run_lxml_comparison_with_html("//div[@id='root']/div//span", CUSTOM_HTML);
}
#[test]
fn test_custom_nested_predicate() {
run_lxml_comparison_with_html("//div[p[strong]]", CUSTOM_HTML);
}
#[test]
fn test_custom_or_predicate() {
run_lxml_comparison_with_html("//li[@class='item first' or @class='item last']", CUSTOM_HTML);
}
#[test]
fn test_custom_contains_dot() {
run_lxml_comparison_with_html("//p[contains(., 'bold')]", CUSTOM_HTML);
}
#[test]
fn test_custom_positional_first_td() {
run_lxml_comparison_with_html("//tr/td[1]", CUSTOM_HTML);
}
#[test]
fn test_custom_positional_last_td() {
run_lxml_comparison_with_html("//tr/td[last()]", CUSTOM_HTML);
}
// Only the match count is compared for the wildcard descendants.
#[test]
fn test_custom_wildcard_descendants() {
run_lxml_count_comparison_with_html("//div[@class='content']//*", CUSTOM_HTML);
}
#[test]
fn test_custom_multi_step_chain() {
run_lxml_comparison_with_html("//div[@class='nested']//span[@data-x]", CUSTOM_HTML);
}
#[test]
fn test_custom_following_sibling_span() {
run_lxml_comparison_with_html(
"//div[@class='siblings']/span[@class='a']/following-sibling::span",
CUSTOM_HTML,
);
}
#[test]
fn test_custom_not_contains() {
run_lxml_comparison_with_html("//li[not(contains(@class, 'first'))]", CUSTOM_HTML);
}
#[test]
fn test_custom_normalize_space() {
run_lxml_comparison_with_html("//li[normalize-space(.) = 'Beta']", CUSTOM_HTML);
}
// Further element selection on the `GITHUB_HTML` sample: `script` elements
// filtered by `@type`, plus structural elements (`label`, `header`, `footer`,
// `nav`, `main`, `article`, `style`).
#[test]
fn test_select_script_with_type() {
run_lxml_comparison("//script[@type]");
}
#[test]
fn test_select_script_exact_type() {
run_lxml_comparison("//script[@type='application/json']");
}
#[test]
fn test_select_all_label() {
run_lxml_comparison("//label");
}
#[test]
fn test_select_all_header() {
run_lxml_comparison("//header");
}
#[test]
fn test_select_all_footer() {
run_lxml_comparison("//footer");
}
#[test]
fn test_select_all_nav() {
run_lxml_comparison("//nav");
}
#[test]
fn test_select_main() {
run_lxml_comparison("//main");
}
#[test]
fn test_select_article() {
run_lxml_comparison("//article");
}
#[test]
fn test_select_style() {
run_lxml_comparison("//style");
}
// Function predicates (`substring`, `translate`, `string`, `boolean`,
// `count`, `starts-with`, `string-length`) and positional steps along sibling
// and preceding axes, all on the `GITHUB_HTML` sample. Tests calling
// `run_lxml_count_comparison` compare only the number of matches.
#[test]
fn test_substring_predicate() {
run_lxml_count_comparison("//a[substring(@href, 1, 5) = 'https']");
}
#[test]
fn test_translate_predicate() {
run_lxml_comparison("//a[translate(@rel, 'AUTHOR', 'author') = 'author']");
}
#[test]
fn test_string_function_exact() {
run_lxml_comparison("//a[string(.) = 'James-LG']");
}
// A string used as a predicate is converted to a boolean.
#[test]
fn test_string_function_truthy() {
run_lxml_count_comparison("//div[string(@class)]");
}
#[test]
fn test_boolean_function() {
run_lxml_count_comparison("//div[boolean(@class)]");
}
// Elements with no element children, expressed two different ways.
#[test]
fn test_leaf_div() {
run_lxml_count_comparison("//div[count(child::*) = 0]");
}
#[test]
fn test_leaf_span() {
run_lxml_count_comparison("//span[not(child::*)]");
}
#[test]
fn test_mod_positional() {
run_lxml_count_comparison("//div[position() mod 2 = 1][1]");
}
#[test]
fn test_starts_with_class() {
run_lxml_comparison("//div[starts-with(@class, 'position')]");
}
#[test]
fn test_following_sibling_wildcard() {
run_lxml_count_comparison("//div[@class]/following-sibling::*[1]");
}
#[test]
fn test_preceding_with_positional() {
run_lxml_comparison("//a[@rel='author']/preceding::a[1]");
}
#[test]
fn test_following_sibling_after_h2() {
run_lxml_count_comparison("//h2[1]/following-sibling::*[1]");
}
#[test]
fn test_string_length_dot() {
run_lxml_count_comparison("//div[string-length(.) > 1000]");
}
/// Hand-written document for nesting-depth tests.
///
/// Contains three nested divs (`#a` > `#b` > `#c`) ending in a `span.deep`, a
/// flat `div#flat` with three sibling spans (`x`, `y`, `z`), and a two-level
/// nested list.
static CUSTOM_HTML_DEEP: &str = r#"<html><body>
<div id="a">
<div id="b">
<div id="c">
<span class="deep">found</span>
</div>
</div>
</div>
<div id="flat">
<span class="x">one</span>
<span class="y">two</span>
<span class="z">three</span>
</div>
<ul>
<li>1<ul><li>1.1</li><li>1.2</li></ul></li>
<li>2<ul><li>2.1</li><li>2.2</li></ul></li>
</ul>
</body></html>"#;
// Tests against `CUSTOM_HTML_DEEP`: ancestor chains, nested lists, positional
// sibling steps and a fully spelled-out absolute path.
#[test]
fn test_custom_deep_ancestor_chain() {
run_lxml_comparison_with_html("//span[@class='deep']/ancestor::div", CUSTOM_HTML_DEEP);
}
#[test]
fn test_custom_nested_list_items() {
run_lxml_comparison_with_html("//ul/li/ul/li", CUSTOM_HTML_DEEP);
}
#[test]
fn test_custom_outer_list_items() {
run_lxml_comparison_with_html("//body/ul/li", CUSTOM_HTML_DEEP);
}
#[test]
fn test_custom_preceding_sibling_positional() {
run_lxml_comparison_with_html(
"//span[@class='z']/preceding-sibling::span[1]",
CUSTOM_HTML_DEEP,
);
}
// NOTE(review): despite the name, this selects `div` elements following
// `div#a`; the nested list is not part of the expression.
#[test]
fn test_custom_nested_list_following() {
run_lxml_comparison_with_html("//div[@id='a']/following::div", CUSTOM_HTML_DEEP);
}
#[test]
fn test_custom_deep_absolute_path() {
run_lxml_comparison_with_html("/html/body/div/div/div/span", CUSTOM_HTML_DEEP);
}
// Explicitly written forward axes (`child::`, `descendant::`,
// `descendant-or-self::`) and simple tag selection for elements that appear in
// `CUSTOM_HTML`.
#[test]
fn test_explicit_child_axis() {
run_lxml_comparison_with_html("//div[@id='root']/child::ul", CUSTOM_HTML);
}
#[test]
fn test_explicit_descendant_axis() {
run_lxml_comparison_with_html("//div[@id='root']/descendant::span", CUSTOM_HTML);
}
#[test]
fn test_explicit_descendant_or_self_axis() {
run_lxml_comparison_with_html(
"//div[@class='nested']/descendant-or-self::div",
CUSTOM_HTML,
);
}
#[test]
fn test_select_all_td() {
run_lxml_comparison_with_html("//td", CUSTOM_HTML);
}
#[test]
fn test_select_all_tr() {
run_lxml_comparison_with_html("//tr", CUSTOM_HTML);
}
#[test]
fn test_select_all_em() {
run_lxml_comparison_with_html("//em", CUSTOM_HTML);
}
#[test]
fn test_select_all_strong() {
run_lxml_comparison_with_html("//strong", CUSTOM_HTML);
}
// Arithmetic in predicates against `CUSTOM_HTML`: `last()` / `position()`
// expressions, `count()` comparisons, `+`, `-` and `mod`.
#[test]
fn test_positional_last_minus_one() {
run_lxml_comparison_with_html("//ul/li[last() - 1]", CUSTOM_HTML);
}
#[test]
fn test_positional_position_eq_last() {
run_lxml_comparison_with_html("//ul/li[position() = last()]", CUSTOM_HTML);
}
#[test]
fn test_positional_not_last() {
run_lxml_comparison_with_html("//ul/li[position() != last()]", CUSTOM_HTML);
}
#[test]
fn test_count_gt_two() {
run_lxml_count_comparison_with_html("//div[count(*) > 2]", CUSTOM_HTML);
}
#[test]
fn test_count_eq_one() {
run_lxml_comparison_with_html("//div[count(*) = 1]", CUSTOM_HTML);
}
#[test]
fn test_arithmetic_count_plus() {
run_lxml_count_comparison_with_html("//div[count(*) + 1 > 3]", CUSTOM_HTML);
}
#[test]
fn test_arithmetic_string_length_sub() {
run_lxml_comparison_with_html("//li[string-length(@class) - 4 > 0]", CUSTOM_HTML);
}
#[test]
fn test_position_mod_even() {
run_lxml_comparison_with_html("//ul/li[position() mod 2 = 0]", CUSTOM_HTML);
}
// Core function library against `CUSTOM_HTML`: string functions (`concat`,
// `substring-before`, `substring-after`), numeric functions (`floor`,
// `ceiling`, `round`), boolean functions (`true`, `false`, `not`) and node
// name functions (`name`, `local-name`).
#[test]
fn test_concat_function() {
run_lxml_comparison_with_html(
"//td[concat(@class, '-extra') = 'c1-extra']",
CUSTOM_HTML,
);
}
// Pads the class list with spaces so that ' item ' matches a whole class token.
#[test]
fn test_concat_in_contains() {
run_lxml_comparison_with_html("//li[contains(concat(' ', @class, ' '), ' item ')]", CUSTOM_HTML);
}
#[test]
fn test_substring_before() {
run_lxml_comparison_with_html(
"//li[substring-before(@class, ' ') = 'item']",
CUSTOM_HTML,
);
}
#[test]
fn test_substring_after() {
run_lxml_comparison_with_html(
"//li[substring-after(@class, 'item ') = 'first']",
CUSTOM_HTML,
);
}
#[test]
fn test_floor_function() {
run_lxml_comparison_with_html("//ul/li[floor(last() div 2) = 1]", CUSTOM_HTML);
}
#[test]
fn test_ceiling_function() {
run_lxml_comparison_with_html("//ul/li[ceiling(last() div 2) >= 2]", CUSTOM_HTML);
}
#[test]
fn test_round_function() {
run_lxml_comparison_with_html("//ul/li[position() = round(1.5)]", CUSTOM_HTML);
}
#[test]
fn test_true_function() {
run_lxml_comparison_with_html("//li[true()]", CUSTOM_HTML);
}
#[test]
fn test_not_false_function() {
run_lxml_comparison_with_html("//li[not(false())]", CUSTOM_HTML);
}
#[test]
fn test_name_function() {
run_lxml_comparison_with_html("//*[name() = 'strong']", CUSTOM_HTML);
}
#[test]
fn test_local_name_function() {
run_lxml_comparison_with_html("//*[local-name() = 'em']", CUSTOM_HTML);
}
#[test]
fn test_double_negation() {
run_lxml_comparison_with_html("//li[not(not(@class))]", CUSTOM_HTML);
}
// Compound expressions: boolean combinations, unions, and paths that move up
// and back down the tree, against `CUSTOM_HTML` and `CUSTOM_HTML_DEEP`.
#[test]
fn test_and_or_combined() {
run_lxml_comparison_with_html(
"//li[contains(@class, 'item') and (contains(@class, 'first') or contains(@class, 'last'))]",
CUSTOM_HTML,
);
}
#[test]
fn test_predicate_mixed_functions() {
run_lxml_comparison_with_html(
"//td[string-length(@class) = 2 and starts-with(@class, 'c')]",
CUSTOM_HTML,
);
}
#[test]
fn test_not_with_positional() {
run_lxml_comparison_with_html("//ul/li[not(position() = 1)]", CUSTOM_HTML);
}
// Unions compare only the match count.
#[test]
fn test_triple_union() {
run_lxml_count_comparison_with_html("//strong | //em | //p", CUSTOM_HTML);
}
#[test]
fn test_union_with_predicates() {
run_lxml_count_comparison_with_html(
"//li[@class='item first'] | //td[@class='c1']",
CUSTOM_HTML,
);
}
#[test]
fn test_parent_then_descendant() {
run_lxml_comparison_with_html("//strong/..//em", CUSTOM_HTML);
}
#[test]
fn test_complex_navigation() {
run_lxml_comparison_with_html("//div[@class='content']//strong/../em", CUSTOM_HTML);
}
#[test]
fn test_ancestor_then_descendant() {
run_lxml_comparison_with_html(
"//span[@class='deep']/ancestor::div[@id='a']//span",
CUSTOM_HTML_DEEP,
);
}
#[test]
fn test_table_cell_to_sibling() {
run_lxml_comparison_with_html("//td[@class='c1']/../td[@class='c2']", CUSTOM_HTML);
}
#[test]
fn test_triple_nested_predicate() {
run_lxml_comparison_with_html("//div[div[div[span]]]", CUSTOM_HTML_DEEP);
}
#[test]
fn test_nested_predicate_attr_at_leaf() {
run_lxml_comparison_with_html("//div[div[span[@class='deep']]]", CUSTOM_HTML_DEEP);
}
// Wildcard steps, relational comparisons, and axis steps carrying an attribute
// predicate, against `CUSTOM_HTML`.
#[test]
fn test_wildcard_body_children() {
run_lxml_comparison_with_html("/html/body/*", CUSTOM_HTML);
}
#[test]
fn test_wildcard_intermediate_step() {
run_lxml_comparison_with_html("//div[@id='root']/*/li", CUSTOM_HTML);
}
#[test]
fn test_double_wildcard() {
run_lxml_count_comparison_with_html("//table//*", CUSTOM_HTML);
}
#[test]
fn test_string_length_lt() {
run_lxml_comparison_with_html("//span[string-length(@class) < 2]", CUSTOM_HTML);
}
#[test]
fn test_count_gte() {
run_lxml_count_comparison_with_html("//div[count(*) >= 3]", CUSTOM_HTML);
}
#[test]
fn test_position_lte() {
run_lxml_comparison_with_html("//div[@class='siblings']/span[position() <= 2]", CUSTOM_HTML);
}
#[test]
fn test_following_sibling_with_attr() {
run_lxml_comparison_with_html(
"//span[@class='a']/following-sibling::span[@class='c']",
CUSTOM_HTML,
);
}
#[test]
fn test_preceding_sibling_with_attr() {
run_lxml_comparison_with_html(
"//span[@class='c']/preceding-sibling::span[@class='a']",
CUSTOM_HTML,
);
}
#[test]
fn test_ancestor_with_predicate() {
run_lxml_comparison_with_html("//span[@data-x]/ancestor::div[@id]", CUSTOM_HTML);
}
// Longer, more realistic expressions evaluated on the `GITHUB_HTML` sample.
// Tests calling `run_lxml_count_comparison` compare only the number of
// matches.
#[test]
fn test_github_role_descendant() {
run_lxml_count_comparison("//main//div[@role]");
}
#[test]
fn test_github_nav_anchors_with_attr() {
run_lxml_comparison("//nav//a[@data-analytics-event]");
}
#[test]
fn test_github_deep_nav_path() {
run_lxml_comparison("//header//nav//ul//li//a");
}
#[test]
fn test_github_multi_attr_predicate() {
run_lxml_count_comparison("//a[@href and @class and @data-analytics-event]");
}
#[test]
fn test_github_nav_anchors_with_text() {
run_lxml_count_comparison("//nav//a[string-length(normalize-space(.)) > 0]");
}
#[test]
fn test_github_class_multi_contains() {
run_lxml_comparison("//div[contains(@class, 'position') and contains(@class, 'relative')]");
}
#[test]
fn test_github_img_ancestor() {
run_lxml_comparison("//img[@alt]/ancestor::a");
}
#[test]
fn test_github_depth_path() {
run_lxml_count_comparison("/html/body/div/div");
}
/// Hand-written document for numeric-comparison and edge-case tests.
///
/// Contains a `div.prices` with four `span.price` elements carrying numeric
/// `data-value` attributes (10, 25, 5, 100), a `div.mixed` whose paragraphs
/// have a multi-token class, a single class, no class and an empty class, and
/// a `div.empty-children` holding two empty divs around one div with a child.
static CUSTOM_HTML_NUMERIC: &str = r#"<html><body>
<div class="prices">
<span class="price" data-value="10">$10</span>
<span class="price" data-value="25">$25</span>
<span class="price" data-value="5">$5</span>
<span class="price" data-value="100">$100</span>
</div>
<div class="mixed">
<p class="a b c">multi-class</p>
<p class="x">single-class</p>
<p>no-class</p>
<p class="">empty-class</p>
</div>
<div class="empty-children">
<div></div>
<div><span>has child</span></div>
<div></div>
</div>
</body></html>"#;
// Tests against `CUSTOM_HTML_NUMERIC`: attribute values compared with numbers,
// and the difference between a present-but-empty attribute and a missing one.
#[test]
fn test_numeric_attr_gt() {
run_lxml_comparison_with_html("//span[@data-value > 10]", CUSTOM_HTML_NUMERIC);
}
#[test]
fn test_numeric_attr_lt() {
run_lxml_comparison_with_html("//span[@data-value < 25]", CUSTOM_HTML_NUMERIC);
}
#[test]
fn test_numeric_attr_eq() {
run_lxml_comparison_with_html("//span[@data-value = 100]", CUSTOM_HTML_NUMERIC);
}
// `[@class]` tests attribute existence; the fixture includes `class=""`.
#[test]
fn test_has_class_including_empty() {
run_lxml_comparison_with_html("//p[@class]", CUSTOM_HTML_NUMERIC);
}
#[test]
fn test_non_empty_class() {
run_lxml_comparison_with_html("//p[string-length(@class) > 0]", CUSTOM_HTML_NUMERIC);
}
#[test]
fn test_no_class_attr() {
run_lxml_comparison_with_html("//p[not(@class)]", CUSTOM_HTML_NUMERIC);
}
// Tests against `CUSTOM_HTML_NUMERIC`: child-existence predicates, `sum()`,
// and chained predicates.
#[test]
fn test_empty_divs() {
run_lxml_comparison_with_html(
"//div[@class='empty-children']/div[not(*)]",
CUSTOM_HTML_NUMERIC,
);
}
#[test]
fn test_non_empty_divs() {
run_lxml_comparison_with_html(
"//div[@class='empty-children']/div[*]",
CUSTOM_HTML_NUMERIC,
);
}
#[test]
fn test_numeric_and_string_predicate() {
run_lxml_comparison_with_html(
"//span[@data-value > 5 and contains(., '$')]",
CUSTOM_HTML_NUMERIC,
);
}
#[test]
fn test_sum_function() {
run_lxml_comparison_with_html(
"//div[sum(span/@data-value) > 50]",
CUSTOM_HTML_NUMERIC,
);
}
#[test]
fn test_triple_chained_predicates() {
run_lxml_comparison_with_html(
"//span[@class][contains(@class, 'price')][@data-value]",
CUSTOM_HTML_NUMERIC,
);
}
// The next two tests apply the same two predicates in opposite order; each
// predicate filters the result of the one before it.
#[test]
fn test_chained_attr_then_positional() {
run_lxml_comparison_with_html("//span[@class='price'][2]", CUSTOM_HTML_NUMERIC);
}
#[test]
fn test_chained_positional_then_attr() {
run_lxml_comparison_with_html("//span[2][@class='price']", CUSTOM_HTML_NUMERIC);
}