use fhp_core::hash::selector_hash;
use fhp_core::tag::Tag;
use fhp_tree::arena::Arena;
use fhp_tree::node::{NodeFlags, NodeId};
use crate::ast::{
AttrOp, AttrSelector, Combinator, CompoundSelector, Selector, SelectorList, SimpleSelector,
};
use crate::bloom::{AncestorBloom, build_ancestor_blooms, hash_str, hash_tag};
#[inline]
fn match_compound(arena: &Arena, node: NodeId, compound: &CompoundSelector) -> bool {
match compound.parts.as_slice() {
[part] => match_simple(arena, node, part),
parts => parts.iter().all(|part| match_simple(arena, node, part)),
}
}
#[inline]
fn match_simple(arena: &Arena, node: NodeId, selector: &SimpleSelector) -> bool {
let n = arena.get(node);
match selector {
SimpleSelector::Tag(tag) => {
if *tag == Tag::Unknown {
return false;
}
n.tag == *tag
}
SimpleSelector::UnknownTag(tag_name) => {
n.tag == Tag::Unknown
&& arena
.unknown_tag_name(node)
.is_some_and(|name| name.eq_ignore_ascii_case(tag_name))
}
SimpleSelector::Class(class_name, bloom_bit) => {
if n.class_hash & bloom_bit == 0 {
return false;
}
let attrs = arena.attrs(node);
attrs.iter().any(|a| {
arena.attr_name(a).eq_ignore_ascii_case("class")
&& arena
.attr_value(a)
.is_some_and(|v| contains_class_token(v, class_name))
})
}
SimpleSelector::Id(id, target_hash) => {
if n.id_hash == 0 || n.id_hash != *target_hash {
return false;
}
let attrs = arena.attrs(node);
attrs.iter().any(|a| {
arena.attr_name(a).eq_ignore_ascii_case("id")
&& arena.attr_value(a) == Some(id.as_str())
})
}
SimpleSelector::Universal => is_element(n),
SimpleSelector::Attr(attr_sel) => match_attr(arena, node, attr_sel),
SimpleSelector::PseudoFirstChild => is_first_element_child(arena, node),
SimpleSelector::PseudoLastChild => is_last_element_child(arena, node),
SimpleSelector::PseudoNthChild { a, b } => is_nth_element_child(arena, node, *a, *b),
SimpleSelector::PseudoNot(inner) => !match_compound(arena, node, inner),
}
}
#[inline]
fn match_attr(arena: &Arena, node: NodeId, sel: &AttrSelector) -> bool {
let attrs = arena.attrs(node);
for attr in attrs {
if !arena
.attr_name(attr)
.eq_ignore_ascii_case(sel.name.as_str())
{
continue;
}
let val = arena.attr_value(attr);
match sel.op {
AttrOp::Exists => return true,
AttrOp::Equals => {
return val == sel.value.as_deref();
}
AttrOp::Includes => {
if let (Some(v), Some(sel_val)) = (val, &sel.value) {
return contains_class_token(v, sel_val.as_str());
}
}
AttrOp::StartsWith => {
if let (Some(v), Some(sel_val)) = (val, &sel.value) {
return v.starts_with(sel_val.as_str());
}
}
AttrOp::EndsWith => {
if let (Some(v), Some(sel_val)) = (val, &sel.value) {
return v.ends_with(sel_val.as_str());
}
}
AttrOp::Substring => {
if let (Some(v), Some(sel_val)) = (val, &sel.value) {
return v.contains(sel_val.as_str());
}
}
}
}
false
}
#[inline]
fn contains_class_token(haystack: &str, needle: &str) -> bool {
if needle.is_empty() {
return false;
}
let h = haystack.as_bytes();
let n = needle.as_bytes();
let mut i = 0usize;
while i < h.len() {
while i < h.len() && h[i].is_ascii_whitespace() {
i += 1;
}
if i >= h.len() {
break;
}
let start = i;
while i < h.len() && !h[i].is_ascii_whitespace() {
i += 1;
}
let len = i - start;
if len == n.len() && &h[start..i] == n {
return true;
}
}
false
}
fn is_first_element_child(arena: &Arena, node: NodeId) -> bool {
let n = arena.get(node);
if n.parent.is_null() {
return false;
}
let parent = arena.get(n.parent);
let mut child = parent.first_child;
while !child.is_null() {
let c = arena.get(child);
if is_element(c) {
return child == node;
}
child = c.next_sibling;
}
false
}
fn is_last_element_child(arena: &Arena, node: NodeId) -> bool {
let n = arena.get(node);
if n.parent.is_null() {
return false;
}
let parent = arena.get(n.parent);
let mut child = parent.last_child;
while !child.is_null() {
let c = arena.get(child);
if is_element(c) {
return child == node;
}
child = c.prev_sibling;
}
false
}
fn is_nth_element_child(arena: &Arena, node: NodeId, a: i32, b: i32) -> bool {
let n = arena.get(node);
if n.parent.is_null() || n.element_index == 0 {
return false;
}
matches_nth(a, b, n.element_index as i32)
}
#[inline]
fn matches_nth(a: i32, b: i32, index: i32) -> bool {
if a == 0 {
return index == b;
}
let diff = index - b;
if diff == 0 {
return true;
}
diff % a == 0 && (diff / a) >= 0
}
#[inline]
fn is_element(n: &fhp_tree::node::Node) -> bool {
n.depth > 0
&& !n.flags.has(NodeFlags::IS_TEXT)
&& !n.flags.has(NodeFlags::IS_COMMENT)
&& !n.flags.has(NodeFlags::IS_DOCTYPE)
}
pub fn match_selector(arena: &Arena, node: NodeId, selector: &Selector) -> bool {
if !match_compound(arena, node, &selector.subject) {
return false;
}
let mut current = node;
for (combinator, compound) in &selector.chain {
match combinator {
Combinator::Descendant => {
let n = arena.get(current);
let mut ancestor = n.parent;
let mut found = false;
while !ancestor.is_null() {
if match_compound(arena, ancestor, compound) {
current = ancestor;
found = true;
break;
}
ancestor = arena.get(ancestor).parent;
}
if !found {
return false;
}
}
Combinator::Child => {
let n = arena.get(current);
if n.parent.is_null() || !match_compound(arena, n.parent, compound) {
return false;
}
current = n.parent;
}
Combinator::AdjacentSibling => {
let prev = prev_element_sibling(arena, current);
match prev {
Some(p) if match_compound(arena, p, compound) => {
current = p;
}
_ => return false,
}
}
Combinator::GeneralSibling => {
let n = arena.get(current);
let mut prev = n.prev_sibling;
let mut found = false;
while !prev.is_null() {
let p = arena.get(prev);
if is_element(p) && match_compound(arena, prev, compound) {
current = prev;
found = true;
break;
}
prev = p.prev_sibling;
}
if !found {
return false;
}
}
}
}
true
}
pub fn match_selector_list(arena: &Arena, node: NodeId, list: &SelectorList) -> bool {
list.selectors
.iter()
.any(|sel| match_selector(arena, node, sel))
}
fn prev_element_sibling(arena: &Arena, node: NodeId) -> Option<NodeId> {
let n = arena.get(node);
let mut prev = n.prev_sibling;
while !prev.is_null() {
let p = arena.get(prev);
if is_element(p) {
return Some(prev);
}
prev = p.prev_sibling;
}
None
}
pub fn select_all(arena: &Arena, root: NodeId, selector: &Selector) -> Vec<NodeId> {
select_all_with_blooms(arena, root, selector, None)
}
fn select_all_with_blooms(
arena: &Arena,
root: NodeId,
selector: &Selector,
shared_blooms: Option<&[AncestorBloom]>,
) -> Vec<NodeId> {
if let Some(tag) = simple_tag_selector(selector) {
let mut results = Vec::new();
collect_tag(arena, root, tag, &mut results);
return results;
}
if let Some(id) = simple_id_selector(selector) {
let mut results = Vec::new();
collect_id(arena, root, id, &mut results);
return results;
}
let has_descendant = has_descendant_combinator(selector);
if has_descendant {
let local_blooms;
let blooms = if let Some(b) = shared_blooms {
b
} else {
local_blooms = build_ancestor_blooms(arena, root);
&local_blooms
};
let bloom_hashes = compute_descendant_hashes(selector);
let mut results = Vec::new();
collect_bloom(arena, root, selector, blooms, &bloom_hashes, &mut results);
results
} else {
let mut results = Vec::new();
collect_simple(arena, root, selector, &mut results);
results
}
}
pub fn select_all_list(arena: &Arena, root: NodeId, list: &SelectorList) -> Vec<NodeId> {
if list.selectors.len() == 1 {
return select_all(arena, root, &list.selectors[0]);
}
let has_any_descendant = list.selectors.iter().any(has_descendant_combinator);
let shared_blooms = has_any_descendant.then(|| build_ancestor_blooms(arena, root));
let mut results = Vec::new();
let mut seen = std::collections::HashSet::new();
for sel in &list.selectors {
for id in select_all_with_blooms(arena, root, sel, shared_blooms.as_deref()) {
if seen.insert(id) {
results.push(id);
}
}
}
results
}
#[inline]
fn has_descendant_combinator(selector: &Selector) -> bool {
selector
.chain
.iter()
.any(|(c, _)| *c == Combinator::Descendant)
}
pub fn select_first(arena: &Arena, root: NodeId, selector: &Selector) -> Option<NodeId> {
if let Some(tag) = simple_tag_selector(selector) {
return find_first_tag(arena, root, tag);
}
if let Some(id) = simple_id_selector(selector) {
return find_first_id(arena, root, id);
}
find_first(arena, root, selector)
}
pub fn select_first_list(arena: &Arena, root: NodeId, list: &SelectorList) -> Option<NodeId> {
find_first_list(arena, root, list)
}
fn collect_simple(arena: &Arena, node: NodeId, selector: &Selector, results: &mut Vec<NodeId>) {
if node.is_null() {
return;
}
let n = arena.get(node);
if is_element(n) && match_selector(arena, node, selector) {
results.push(node);
}
let mut child = n.first_child;
while !child.is_null() {
collect_simple(arena, child, selector, results);
child = arena.get(child).next_sibling;
}
}
fn collect_id(arena: &Arena, node: NodeId, target_id: &str, results: &mut Vec<NodeId>) {
if node.is_null() {
return;
}
let n = arena.get(node);
if is_element(n) && node_has_id(arena, node, target_id) {
results.push(node);
}
let mut child = n.first_child;
while !child.is_null() {
collect_id(arena, child, target_id, results);
child = arena.get(child).next_sibling;
}
}
fn collect_tag(arena: &Arena, node: NodeId, target_tag: Tag, results: &mut Vec<NodeId>) {
if node.is_null() {
return;
}
let n = arena.get(node);
if is_element(n) && n.tag == target_tag {
results.push(node);
}
let mut child = n.first_child;
while !child.is_null() {
collect_tag(arena, child, target_tag, results);
child = arena.get(child).next_sibling;
}
}
fn collect_bloom(
arena: &Arena,
node: NodeId,
selector: &Selector,
blooms: &[AncestorBloom],
bloom_hashes: &[u32],
results: &mut Vec<NodeId>,
) {
if node.is_null() {
return;
}
let n = arena.get(node);
if is_element(n) {
let bloom = &blooms[node.index()];
let bloom_pass = bloom_hashes.iter().all(|&h| bloom.may_contain(h));
if bloom_pass && match_selector(arena, node, selector) {
results.push(node);
}
}
let mut child = n.first_child;
while !child.is_null() {
collect_bloom(arena, child, selector, blooms, bloom_hashes, results);
child = arena.get(child).next_sibling;
}
}
fn compute_descendant_hashes(selector: &Selector) -> Vec<u32> {
let mut hashes = Vec::new();
for (combinator, compound) in &selector.chain {
if *combinator != Combinator::Descendant {
continue;
}
for part in &compound.parts {
match part {
SimpleSelector::Tag(tag) if *tag != Tag::Unknown => {
hashes.push(hash_tag(*tag));
}
SimpleSelector::UnknownTag(_) => {}
SimpleSelector::Class(class, _) => {
hashes.push(hash_str(class));
}
SimpleSelector::Id(id, _) => {
hashes.push(hash_str(id));
}
_ => {}
}
}
}
hashes
}
fn find_first(arena: &Arena, node: NodeId, selector: &Selector) -> Option<NodeId> {
if node.is_null() {
return None;
}
let n = arena.get(node);
if is_element(n) && match_selector(arena, node, selector) {
return Some(node);
}
let mut child = n.first_child;
while !child.is_null() {
if let Some(found) = find_first(arena, child, selector) {
return Some(found);
}
child = arena.get(child).next_sibling;
}
None
}
fn find_first_id(arena: &Arena, node: NodeId, target_id: &str) -> Option<NodeId> {
if node.is_null() {
return None;
}
let n = arena.get(node);
if is_element(n) && node_has_id(arena, node, target_id) {
return Some(node);
}
let mut child = n.first_child;
while !child.is_null() {
if let Some(found) = find_first_id(arena, child, target_id) {
return Some(found);
}
child = arena.get(child).next_sibling;
}
None
}
fn find_first_tag(arena: &Arena, node: NodeId, target_tag: Tag) -> Option<NodeId> {
if node.is_null() {
return None;
}
let n = arena.get(node);
if is_element(n) && n.tag == target_tag {
return Some(node);
}
let mut child = n.first_child;
while !child.is_null() {
if let Some(found) = find_first_tag(arena, child, target_tag) {
return Some(found);
}
child = arena.get(child).next_sibling;
}
None
}
fn find_first_list(arena: &Arena, node: NodeId, list: &SelectorList) -> Option<NodeId> {
if node.is_null() {
return None;
}
let n = arena.get(node);
if is_element(n) && match_selector_list(arena, node, list) {
return Some(node);
}
let mut child = n.first_child;
while !child.is_null() {
if let Some(found) = find_first_list(arena, child, list) {
return Some(found);
}
child = arena.get(child).next_sibling;
}
None
}
#[inline]
fn simple_id_selector(selector: &Selector) -> Option<&str> {
if !selector.chain.is_empty() {
return None;
}
match selector.subject.parts.as_slice() {
[SimpleSelector::Id(id, _)] => Some(id.as_str()),
_ => None,
}
}
#[inline]
fn simple_tag_selector(selector: &Selector) -> Option<Tag> {
if !selector.chain.is_empty() {
return None;
}
match selector.subject.parts.as_slice() {
[SimpleSelector::Tag(tag)] if *tag != Tag::Unknown => Some(*tag),
_ => None,
}
}
#[inline]
fn node_has_id(arena: &Arena, node: NodeId, target_id: &str) -> bool {
let n = arena.get(node);
let target_hash = selector_hash(target_id.as_bytes());
if n.id_hash == 0 || n.id_hash != target_hash {
return false;
}
arena.attrs(node).iter().any(|a| {
arena.attr_name(a).eq_ignore_ascii_case("id") && arena.attr_value(a) == Some(target_id)
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::parse_selector;
fn parse_and_match(html: &str, css: &str) -> Vec<NodeId> {
let doc = fhp_tree::parse(html).unwrap();
let list = parse_selector(css).unwrap();
select_all_list(doc.arena(), doc.root_id(), &list)
}
#[test]
fn match_tag() {
let ids = parse_and_match("<div><p>text</p></div>", "p");
assert_eq!(ids.len(), 1);
}
#[test]
fn match_tag_multiple_nodes() {
let ids = parse_and_match("<div><p>a</p><p>b</p></div>", "p");
assert_eq!(ids.len(), 2);
}
#[test]
fn match_tag_first_is_document_order() {
let doc = fhp_tree::parse("<div><p>a</p><p>b</p></div>").unwrap();
let list = parse_selector("p").unwrap();
let first = select_first_list(doc.arena(), doc.root_id(), &list).unwrap();
assert_eq!(doc.get(first).text_content(), "a");
}
#[test]
fn match_class() {
let ids = parse_and_match("<div class=\"a\"><span class=\"b\">x</span></div>", ".b");
assert_eq!(ids.len(), 1);
}
#[test]
fn match_id() {
let ids = parse_and_match("<div id=\"main\">x</div>", "#main");
assert_eq!(ids.len(), 1);
}
#[test]
fn match_id_duplicates() {
let ids = parse_and_match("<div id=\"dup\">a</div><span id=\"dup\">b</span>", "#dup");
assert_eq!(ids.len(), 2);
}
#[test]
fn match_id_first_is_document_order() {
let doc = fhp_tree::parse("<div id=\"dup\">a</div><span id=\"dup\">b</span>").unwrap();
let list = parse_selector("#dup").unwrap();
let first = select_first_list(doc.arena(), doc.root_id(), &list).unwrap();
assert_eq!(doc.get(first).text_content(), "a");
}
#[test]
fn match_universal() {
let ids = parse_and_match("<div><p>text</p></div>", "*");
assert_eq!(ids.len(), 2);
}
#[test]
fn match_descendant() {
let ids = parse_and_match("<div><p>a</p></div><p>b</p>", "div p");
assert_eq!(ids.len(), 1);
}
#[test]
fn match_descendant_with_unknown_ancestor() {
let ids = parse_and_match(
"<my-widget><span>a</span></my-widget><span>b</span>",
"my-widget span",
);
assert_eq!(ids.len(), 1);
}
#[test]
fn match_child() {
let ids = parse_and_match("<div><p>a</p><span><p>b</p></span></div>", "div > p");
assert_eq!(ids.len(), 1);
}
#[test]
fn match_adjacent_sibling() {
let ids = parse_and_match(
"<div><h1>title</h1><p>first</p><p>second</p></div>",
"h1 + p",
);
assert_eq!(ids.len(), 1);
}
#[test]
fn match_general_sibling() {
let ids = parse_and_match(
"<div><h1>title</h1><p>first</p><p>second</p></div>",
"h1 ~ p",
);
assert_eq!(ids.len(), 2);
}
#[test]
fn match_compound_tag_class() {
let ids = parse_and_match("<div class=\"a\"><div class=\"b\">x</div></div>", "div.b");
assert_eq!(ids.len(), 1);
}
#[test]
fn match_attr_exists() {
let ids = parse_and_match("<a href=\"x\">link</a><span>text</span>", "[href]");
assert_eq!(ids.len(), 1);
}
#[test]
fn match_attr_equals() {
let ids = parse_and_match("<a href=\"x\">a</a><a href=\"y\">b</a>", "[href=\"x\"]");
assert_eq!(ids.len(), 1);
}
#[test]
fn match_attr_starts_with() {
let ids = parse_and_match(
"<a href=\"https://a\">a</a><a href=\"http://b\">b</a>",
"[href^=\"https\"]",
);
assert_eq!(ids.len(), 1);
}
#[test]
fn match_attr_ends_with() {
let ids = parse_and_match(
"<a href=\"a.html\">a</a><a href=\"b.php\">b</a>",
"[href$=\".html\"]",
);
assert_eq!(ids.len(), 1);
}
#[test]
fn match_attr_substring() {
let ids = parse_and_match(
"<a href=\"https://example.com\">a</a><a href=\"other\">b</a>",
"[href*=\"example\"]",
);
assert_eq!(ids.len(), 1);
}
#[test]
fn match_first_child() {
let ids = parse_and_match("<ul><li>1</li><li>2</li><li>3</li></ul>", "li:first-child");
assert_eq!(ids.len(), 1);
}
#[test]
fn match_last_child() {
let ids = parse_and_match("<ul><li>1</li><li>2</li><li>3</li></ul>", "li:last-child");
assert_eq!(ids.len(), 1);
}
#[test]
fn match_nth_child_odd() {
let ids = parse_and_match(
"<ul><li>1</li><li>2</li><li>3</li><li>4</li></ul>",
"li:nth-child(odd)",
);
assert_eq!(ids.len(), 2); }
#[test]
fn match_nth_child_even() {
let ids = parse_and_match(
"<ul><li>1</li><li>2</li><li>3</li><li>4</li></ul>",
"li:nth-child(even)",
);
assert_eq!(ids.len(), 2); }
#[test]
fn match_nth_child_number() {
let ids = parse_and_match("<ul><li>1</li><li>2</li><li>3</li></ul>", "li:nth-child(2)");
assert_eq!(ids.len(), 1);
}
#[test]
fn match_not() {
let ids = parse_and_match(
"<div class=\"a\">x</div><div class=\"b\">y</div><div>z</div>",
"div:not(.a)",
);
assert_eq!(ids.len(), 2);
}
#[test]
fn match_comma_list() {
let ids = parse_and_match("<div>a</div><span>b</span><p>c</p>", "div, span");
assert_eq!(ids.len(), 2);
}
#[test]
fn matches_nth_formula() {
assert!(matches_nth(2, 1, 1));
assert!(!matches_nth(2, 1, 2));
assert!(matches_nth(2, 1, 3));
assert!(!matches_nth(0, 3, 1));
assert!(matches_nth(0, 3, 3));
assert!(matches_nth(3, 0, 3));
assert!(matches_nth(3, 0, 6));
assert!(!matches_nth(3, 0, 1));
}
#[test]
fn class_token_exact_word() {
assert!(contains_class_token("a b c", "b"));
assert!(!contains_class_token("a bc d", "b"));
assert!(!contains_class_token("abc", "b"));
}
#[test]
fn class_token_ascii_whitespace() {
assert!(contains_class_token("a\tb\nc", "b"));
assert!(contains_class_token(" a b ", "a"));
assert!(!contains_class_token(" a b ", "c"));
}
}