use std::char;
use crate::input_stream::InputStream;
use crate::{Node, NodeKind};
/// A selector that has been parsed into a sequence of node checks.
///
/// Instances are produced by `parse_selector` (or `CompiledSelector::new`) and
/// evaluated against a node with `CompiledSelector::matches`.
pub(crate) struct CompiledSelector {
/// Compound selectors in the order they appear in the selector text, each
/// paired with the combinator written after it. Empty when the selector text
/// is empty or consists only of spaces.
checks: Vec<NodeCheck>,
}
/// One step of a compiled selector: a compound selector plus the combinator
/// that follows it in the selector text.
struct NodeCheck {
/// Conditions a candidate node has to satisfy.
selector: CompoundSelector,
/// Relationship used to find candidate nodes for `selector`, starting from the
/// node matched by the next step. The final step always carries
/// `Combinator::Identity`.
combinator: Combinator,
}
/// A tag check together with zero or more attribute checks that must all hold
/// for the same node.
struct CompoundSelector {
/// Check on the element name; `TagOperator::Any` when no tag name was written.
tag: TagCheck,
/// Checks produced by `#id`, `.class` and `[attr...]` parts, in source order.
attrs: Vec<AttributeCheck>,
}
/// Check on a node's tag name.
struct TagCheck {
/// How `value` is compared with the node name.
op: TagOperator,
/// Lowercased tag name for `TagOperator::Eq`; the parser stores an empty
/// string for `TagOperator::Any`.
value: String,
}
/// Comparison applied by a `TagCheck`.
enum TagOperator {
/// Accept every tag name (`*`, or a compound selector without a tag name).
Any,
/// Accept only nodes whose name equals the stored value.
Eq,
}
/// Check on a single attribute of a node.
struct AttributeCheck {
/// Attribute to look up: `"id"` for `#name`, `"class"` for `.name`, or the
/// ASCII-lowercased name written inside `[...]`.
name: String,
/// Operand of the comparison; the parser stores an empty string for
/// `AttributeOperator::Exists`.
value: String,
/// Comparison to perform between the attribute and `value`.
op: AttributeOperator,
}
/// Comparison applied by an `AttributeCheck`.
///
/// Each variant corresponds to one attribute-selector form accepted by
/// `parse_selector`.
enum AttributeOperator {
/// `[attr=value]`, and also what `#name` and `.name` compile to.
Eq,
/// `[attr]`: the attribute only has to be present.
Exists,
/// `[attr~=value]`: match against whitespace-separated words.
WsSeparated,
/// `[attr|=value]`: hyphen-separated match.
HyphenSuffixed,
/// `[attr^=value]`: prefix match.
Prefixed,
/// `[attr$=value]`: suffix match.
Suffixed,
/// `[attr*=value]`: substring match.
Contained,
}
/// Relationship between two consecutive compound selectors.
///
/// The parser stores a combinator on the compound selector written *before*
/// it; during matching it maps the node matched by the following compound
/// selector to the candidate nodes for this one.
#[derive(PartialEq, Eq)]
enum Combinator {
/// No combinator: used for the last compound selector; yields the node itself.
Identity,
/// `>`: yields the node's parent, if any.
Parent,
/// A run of spaces: yields the nodes returned by `Node::ancestors`.
Ancestors,
/// `+`: yields the first element found by `Node::preceding`.
LeftSibling,
/// `~`: yields every node returned by `Node::preceding`.
PrecedingSiblings,
}
/// Reasons `parse_selector` can reject a selector string.
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum ParserError {
/// The text after an attribute name is not `]` or one of the operators
/// `=`, `~=`, `|=`, `^=`, `$=`, `*=`.
UnrecognizedAttributeSelector,
/// An attribute value is neither a quoted string nor something that begins
/// like an identifier.
UnexpectedAttributeValue,
/// An attribute value is not immediately followed by `]`.
UnexpectedEndOfAttributeSelector,
/// A compound selector was expected but none was found, e.g. the selector
/// starts with a combinator.
MissingSelector,
/// A compound selector is directly followed by a character that is neither
/// a space nor one of `>`, `+`, `~`.
MissingCombinator,
/// The selector ends with `>`, `+` or `~`.
TrailingCombinator,
/// A quoted string is missing its closing quote.
UnterminatedString,
/// A quoted string contains a newline that is not preceded by a backslash.
UnescapedNewlineInString,
/// The input ended where an escaped code point was expected.
UnexpectedEofInEscapedCodepoint,
}
pub(crate) fn parse_selector(selector: &str) -> Result<CompiledSelector, ParserError> {
let mut input = InputStream::new(selector);
let mut checks = Vec::new();
loop {
skip_spaces(&mut input);
if input.peek().is_none() {
break;
}
let mut attr_checks: Vec<AttributeCheck> = Vec::new();
let mut tag_check: Option<TagCheck> = None;
if input.peek() == Some('*') {
input.consume();
tag_check = Some(TagCheck {
op: TagOperator::Any,
value: String::new(),
});
} else if input.peek().is_some_and(is_ident) {
let ident = parse_identifier(&mut input)?;
tag_check = Some(TagCheck {
op: TagOperator::Eq,
value: ident.to_lowercase(),
})
}
loop {
match input.peek() {
Some('#') => {
input.consume();
let ident = parse_identifier(&mut input)?;
attr_checks.push(AttributeCheck {
name: String::from("id"),
op: AttributeOperator::Eq,
value: ident,
});
}
Some('.') => {
input.consume();
let ident = parse_identifier(&mut input)?;
attr_checks.push(AttributeCheck {
name: String::from("class"),
op: AttributeOperator::Eq,
value: ident,
});
}
Some('[') => {
input.consume();
let mut name = parse_identifier(&mut input)?;
name.make_ascii_lowercase();
if input.peek() == Some(']') {
input.consume();
attr_checks.push(AttributeCheck {
name,
op: AttributeOperator::Exists,
value: String::new(),
});
continue;
}
let op = if input.peek() == Some('=') {
input.consume();
AttributeOperator::Eq
} else {
let op = match input.lookahead(2) {
"~=" => AttributeOperator::WsSeparated,
"|=" => AttributeOperator::HyphenSuffixed,
"^=" => AttributeOperator::Prefixed,
"$=" => AttributeOperator::Suffixed,
"*=" => AttributeOperator::Contained,
_ => return Err(ParserError::UnrecognizedAttributeSelector),
};
input.advance(2);
op
};
let value = match input.peek() {
Some('"') => {
input.consume();
parse_string_token(&mut input, '"')?
}
Some('\'') => {
input.consume();
parse_string_token(&mut input, '\'')?
}
Some(c) if is_ident_start(c) => parse_identifier(&mut input)?,
_ => {
return Err(ParserError::UnexpectedAttributeValue);
}
};
if input.consume() != Some(']') {
return Err(ParserError::UnexpectedEndOfAttributeSelector);
}
attr_checks.push(AttributeCheck { name, op, value })
}
_ => break,
}
}
if attr_checks.is_empty() && tag_check.is_none() {
return Err(ParserError::MissingSelector);
}
if !attr_checks.is_empty() && tag_check.is_none() {
tag_check = Some(TagCheck {
op: TagOperator::Any,
value: String::new(),
});
}
let saw_space = skip_spaces(&mut input) > 0;
let next = input.peek();
let combinator = if next == Some('>') {
input.consume();
Combinator::Parent
} else if next == Some('+') {
input.consume();
Combinator::LeftSibling
} else if next == Some('~') {
input.consume();
Combinator::PrecedingSiblings
} else if next.is_none() {
input.consume();
Combinator::Identity
} else if saw_space {
Combinator::Ancestors
} else {
return Err(ParserError::MissingCombinator);
};
checks.push(NodeCheck {
combinator,
selector: CompoundSelector {
tag: tag_check.unwrap(),
attrs: attr_checks,
},
})
}
if !checks.is_empty() && checks.last().unwrap().combinator != Combinator::Identity {
return Err(ParserError::TrailingCombinator);
}
Ok(CompiledSelector { checks })
}
/// Reads the body of a quoted string up to and including the closing `end`
/// quote; the opening quote must already have been consumed.
///
/// Backslash escapes are resolved through `parse_escaped_codepoint`. A
/// backslash directly followed by a newline consumes both and contributes
/// nothing to the result; a backslash at the very end of the input is ignored.
///
/// # Errors
///
/// * `ParserError::UnterminatedString` if the input ends before `end` is seen.
/// * `ParserError::UnescapedNewlineInString` on a newline without a preceding
///   backslash.
fn parse_string_token(input: &mut InputStream, end: char) -> Result<String, ParserError> {
    let mut text = String::new();
    loop {
        let Some(c) = input.consume() else {
            return Err(ParserError::UnterminatedString);
        };
        // The closing quote takes precedence over every other interpretation.
        if c == end {
            return Ok(text);
        }
        match c {
            '\n' => return Err(ParserError::UnescapedNewlineInString),
            '\\' => match input.peek() {
                None => {}
                Some('\n') => {
                    input.consume();
                }
                Some(_) => text.push(parse_escaped_codepoint(input)?),
            },
            other => text.push(other),
        }
    }
}
fn parse_identifier(input: &mut InputStream) -> Result<String, ParserError> {
let mut buffer = String::new();
loop {
let curr = input.peek();
if curr.is_some_and(is_ident) {
input.consume();
buffer.push(curr.unwrap());
} else if starts_with_valid_escape(input.lookahead(2)) {
input.consume();
let codepoint = parse_escaped_codepoint(input)?;
buffer.push(codepoint);
} else {
return Ok(buffer);
}
}
}
/// Decodes the code point introduced by a backslash that has already been
/// consumed.
///
/// * One to six hexadecimal digits are read as a code point number; a single
///   whitespace character directly after the digits is swallowed. A number
///   that is zero, a surrogate, or beyond the Unicode range decodes to
///   U+FFFD REPLACEMENT CHARACTER.
/// * Any other character stands for itself.
///
/// # Errors
///
/// Returns `ParserError::UnexpectedEofInEscapedCodepoint` when there is no
/// character left to read.
fn parse_escaped_codepoint(input: &mut InputStream) -> Result<char, ParserError> {
    let Some(first) = input.consume() else {
        return Err(ParserError::UnexpectedEofInEscapedCodepoint);
    };
    if !first.is_ascii_hexdigit() {
        return Ok(first);
    }

    // Hex digits are ASCII, so the byte length equals the digit count.
    let mut digits = String::with_capacity(6);
    digits.push(first);
    while digits.len() < 6 {
        match input.peek() {
            Some(next) if next.is_ascii_hexdigit() => {
                input.consume();
                digits.push(next);
            }
            _ => break,
        }
    }
    if input.peek().is_some_and(is_whitespace) {
        input.consume();
    }

    let codepoint = u32::from_str_radix(&digits, 16)
        .ok()
        // `char::from_u32` already rejects surrogates and out-of-range values;
        // zero has to be excluded explicitly.
        .filter(|&number| number != 0)
        .and_then(char::from_u32)
        .unwrap_or(char::REPLACEMENT_CHARACTER);
    Ok(codepoint)
}
/// Reports whether `input` begins with a backslash escape: a backslash
/// followed by at least one more byte that is not a newline.
fn starts_with_valid_escape(input: &str) -> bool {
    match input.as_bytes() {
        [b'\\', following, ..] => *following != b'\n',
        _ => false,
    }
}
/// Consumes consecutive whitespace characters and returns how many were
/// skipped.
///
/// Space, tab and newline all count, the same set `is_whitespace` accepts, so
/// selectors formatted with tabs or line breaks parse like their
/// space-separated equivalents.
fn skip_spaces(input: &mut InputStream) -> usize {
    let mut skipped = 0;
    while matches!(input.peek(), Some(' ' | '\t' | '\n')) {
        input.consume();
        skipped += 1;
    }
    skipped
}
/// Returns `true` for a space, a tab or a newline.
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\n')
}
/// Returns `true` if `c` may begin an identifier: an ASCII letter, `_`, or any
/// non-ASCII code point (U+0080 and above), following the CSS definition of an
/// ident-start code point.
fn is_ident_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_' || !c.is_ascii()
}
/// Returns `true` if `c` may appear inside an identifier: anything accepted by
/// `is_ident_start`, an ASCII digit, or `-`.
fn is_ident(c: char) -> bool {
    matches!(c, '0'..='9' | '-') || is_ident_start(c)
}
impl CompiledSelector {
pub(crate) fn new(selector: &str) -> Result<CompiledSelector, ParserError> {
parse_selector(selector)
}
pub(crate) fn matches<'a>(&self, node: Node<'a>) -> bool {
matches_node(self.checks.as_slice(), node)
}
}
/// Evaluates `checks` right to left, starting from `node`.
///
/// The last check's combinator produces candidate nodes from `node`; a
/// candidate is accepted when it passes that check's compound selector and the
/// remaining (earlier) checks match starting from the candidate. An empty
/// `checks` slice matches unconditionally.
fn matches_node<'a>(checks: &[NodeCheck], node: Node<'a>) -> bool {
    match checks.split_last() {
        None => true,
        Some((last, earlier)) => last
            .combinator
            .apply(node)
            .any(|candidate| last.selector.test(candidate) && matches_node(earlier, candidate)),
    }
}
impl Combinator {
    /// Returns the nodes related to `node` through this combinator, i.e. the
    /// candidates the associated compound selector is tested against.
    fn apply<'a>(&self, node: Node<'a>) -> Box<dyn Iterator<Item = Node<'a>> + 'a> {
        match self {
            Combinator::Identity => Box::new(Some(node).into_iter()),
            Combinator::Parent => Box::new(node.parent().into_iter()),
            Combinator::Ancestors => Box::new(node.ancestors()),
            Combinator::PrecedingSiblings => Box::new(node.preceding()),
            Combinator::LeftSibling => {
                // Only the first element yielded by `preceding()` qualifies
                // (presumably the nearest one; confirm against `Node::preceding`).
                let nearest_element = node
                    .preceding()
                    .find(|candidate| candidate.kind() == NodeKind::Element);
                Box::new(nearest_element.into_iter())
            }
        }
    }
}
impl CompoundSelector {
    /// Reports whether `node` is an element that passes the tag check and
    /// every attribute check. Evaluation stops at the first failing condition.
    fn test<'a>(&self, node: Node<'a>) -> bool {
        node.kind() == NodeKind::Element
            && self.tag.test(node)
            && self.attrs.iter().all(|attr| attr.test(node))
    }
}
impl TagCheck {
    /// Reports whether `node`'s name satisfies this check: always for
    /// `TagOperator::Any`, otherwise only when the name equals `value`.
    fn test<'a>(&self, node: Node<'a>) -> bool {
        let name_must_match = match self.op {
            TagOperator::Any => false,
            TagOperator::Eq => true,
        };
        !name_must_match || *node.name() == self.value
    }
}
impl AttributeCheck {
    /// Reports whether `node` has the attribute `name` and its value satisfies
    /// the operator. A missing attribute never matches.
    ///
    /// Operator semantics follow the CSS attribute selectors:
    ///
    /// * `Exists`: the attribute is present.
    /// * `Eq`: the value equals `value` exactly.
    /// * `WsSeparated`: one of the whitespace-separated words equals `value`.
    /// * `HyphenSuffixed`: the value is exactly `value`, or starts with
    ///   `value` immediately followed by `-`.
    /// * `Prefixed` / `Suffixed` / `Contained`: the value starts with / ends
    ///   with / contains `value`; an empty `value` matches nothing.
    fn test<'a>(&self, node: Node<'a>) -> bool {
        let Some(value) = node.attr(&self.name) else {
            return false;
        };
        match self.op {
            AttributeOperator::Exists => true,
            AttributeOperator::Eq => self.value == *value,
            AttributeOperator::WsSeparated => value
                .split_ascii_whitespace()
                .any(|part| part == self.value),
            // Comparing only the first hyphen-delimited segment would reject
            // operands that contain a hyphen themselves (e.g. `en-US`).
            AttributeOperator::HyphenSuffixed => value
                .strip_prefix(self.value.as_str())
                .is_some_and(|rest| rest.is_empty() || rest.starts_with('-')),
            AttributeOperator::Prefixed => {
                !self.value.is_empty() && value.starts_with(&self.value)
            }
            AttributeOperator::Suffixed => !self.value.is_empty() && value.ends_with(&self.value),
            AttributeOperator::Contained => !self.value.is_empty() && value.contains(&self.value),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt::Write as _;

    /// Label used for a tag operator in the textual dump.
    fn debug_print_tag_operation(op: TagOperator) -> &'static str {
        match op {
            TagOperator::Any => "*",
            TagOperator::Eq => "EQ",
        }
    }

    /// Label used for an attribute operator in the textual dump.
    fn debug_print_attribute_operator(op: AttributeOperator) -> &'static str {
        match op {
            AttributeOperator::Contained => "CONTAINED",
            AttributeOperator::Eq => "EQ",
            AttributeOperator::Exists => "EXISTS",
            AttributeOperator::HyphenSuffixed => "HYPHEN_SEPARATED",
            AttributeOperator::Prefixed => "PREFIXED",
            AttributeOperator::Suffixed => "SUFFIXED",
            AttributeOperator::WsSeparated => "WS_SEPARATED",
        }
    }

    /// Label used for a combinator in the textual dump.
    fn debug_print_combinator(op: Combinator) -> &'static str {
        match op {
            Combinator::Ancestors => "ANCESTORS",
            Combinator::Identity => "IDENTITY",
            Combinator::LeftSibling => "LEFT_SIBLING",
            Combinator::Parent => "PARENT",
            Combinator::PrecedingSiblings => "PRECEDING_SIBLINGS",
        }
    }

    /// Renders each check of `selector` as
    /// `[tag=..,op=..|attr=..,value=..,op=..] COMBINATOR`, one string per check.
    fn debug_print_selector(selector: CompiledSelector) -> Vec<String> {
        let mut res = Vec::new();
        for check in selector.checks {
            let mut buf = String::new();
            write!(
                &mut buf,
                "[tag={},op={}",
                check.selector.tag.value,
                debug_print_tag_operation(check.selector.tag.op)
            )
            .unwrap();
            for attr_check in check.selector.attrs {
                write!(
                    &mut buf,
                    "|attr={},value={},op={}",
                    attr_check.name,
                    attr_check.value,
                    debug_print_attribute_operator(attr_check.op)
                )
                .unwrap();
            }
            write!(&mut buf, "] {}", debug_print_combinator(check.combinator)).unwrap();
            res.push(buf);
        }
        res
    }

    /// Asserts that `selector` parses and dumps to `expected`.
    #[track_caller]
    fn check(selector: &str, expected: Vec<&str>) {
        let compiled = match parse_selector(selector) {
            Ok(compiled) => compiled,
            Err(err) => {
                panic!("unexpected error: {:?}", err)
            }
        };
        assert_eq!(debug_print_selector(compiled), expected);
    }

    /// Asserts that parsing `selector` fails with `expected_err`.
    #[track_caller]
    fn check_err(selector: &str, expected_err: ParserError) {
        match parse_selector(selector) {
            Ok(compiled) => {
                panic!("expected error, got {:?}", debug_print_selector(compiled));
            }
            Err(err) => {
                assert_eq!(err, expected_err);
            }
        };
    }

    #[test]
    fn selectortest() {
        // Empty input and single simple selectors.
        check("", vec![]);
        check(" ", vec![]);
        check("*", vec!["[tag=,op=*] IDENTITY"]);
        check("div", vec!["[tag=div,op=EQ] IDENTITY"]);
        check("#div", vec!["[tag=,op=*|attr=id,value=div,op=EQ] IDENTITY"]);
        check(
            ".div",
            vec!["[tag=,op=*|attr=class,value=div,op=EQ] IDENTITY"],
        );

        // Attribute selectors, one per operator.
        check(
            "[id]",
            vec!["[tag=,op=*|attr=id,value=,op=EXISTS] IDENTITY"],
        );
        check(
            "[id=test]",
            vec!["[tag=,op=*|attr=id,value=test,op=EQ] IDENTITY"],
        );
        check(
            r#"[id="test"]"#,
            vec!["[tag=,op=*|attr=id,value=test,op=EQ] IDENTITY"],
        );
        check(
            r#"[id='test']"#,
            vec!["[tag=,op=*|attr=id,value=test,op=EQ] IDENTITY"],
        );
        check(
            "[id~=test]",
            vec!["[tag=,op=*|attr=id,value=test,op=WS_SEPARATED] IDENTITY"],
        );
        check(
            "[id|=test]",
            vec!["[tag=,op=*|attr=id,value=test,op=HYPHEN_SEPARATED] IDENTITY"],
        );
        check(
            "[id^=test]",
            vec!["[tag=,op=*|attr=id,value=test,op=PREFIXED] IDENTITY"],
        );
        check(
            "[id$=test]",
            vec!["[tag=,op=*|attr=id,value=test,op=SUFFIXED] IDENTITY"],
        );
        check(
            "[id*=test]",
            vec!["[tag=,op=*|attr=id,value=test,op=CONTAINED] IDENTITY"],
        );

        // Compound selectors keep their parts in source order.
        check(
            ".class#id[attr]",
            vec![
                r#"[tag=,op=*|attr=class,value=class,op=EQ|attr=id,value=id,op=EQ|attr=attr,value=,op=EXISTS] IDENTITY"#,
            ],
        );
        check(
            "[attr].class#id",
            vec![
                r#"[tag=,op=*|attr=attr,value=,op=EXISTS|attr=class,value=class,op=EQ|attr=id,value=id,op=EQ] IDENTITY"#,
            ],
        );
        check(
            "#id[attr].class",
            vec![
                r#"[tag=,op=*|attr=id,value=id,op=EQ|attr=attr,value=,op=EXISTS|attr=class,value=class,op=EQ] IDENTITY"#,
            ],
        );

        // Combinators, with and without surrounding spaces.
        check(
            "* #test",
            vec![
                r#"[tag=,op=*] ANCESTORS"#,
                r#"[tag=,op=*|attr=id,value=test,op=EQ] IDENTITY"#,
            ],
        );
        check(
            "div > .test",
            vec![
                r#"[tag=div,op=EQ] PARENT"#,
                r#"[tag=,op=*|attr=class,value=test,op=EQ] IDENTITY"#,
            ],
        );
        check(
            "div>.test",
            vec![
                r#"[tag=div,op=EQ] PARENT"#,
                r#"[tag=,op=*|attr=class,value=test,op=EQ] IDENTITY"#,
            ],
        );
        check(
            "[data-selected] + *",
            vec![
                r#"[tag=,op=*|attr=data-selected,value=,op=EXISTS] LEFT_SIBLING"#,
                r#"[tag=,op=*] IDENTITY"#,
            ],
        );
        check(
            "[data-selected]+*",
            vec![
                r#"[tag=,op=*|attr=data-selected,value=,op=EXISTS] LEFT_SIBLING"#,
                r#"[tag=,op=*] IDENTITY"#,
            ],
        );
        check(
            "div ~ span",
            vec![
                r#"[tag=div,op=EQ] PRECEDING_SIBLINGS"#,
                r#"[tag=span,op=EQ] IDENTITY"#,
            ],
        );
        check(
            "div~span",
            vec![
                r#"[tag=div,op=EQ] PRECEDING_SIBLINGS"#,
                r#"[tag=span,op=EQ] IDENTITY"#,
            ],
        );

        // Escapes in unquoted attribute values.
        check(
            r#"[id=te\'st]"#,
            vec![r#"[tag=,op=*|attr=id,value=te'st,op=EQ] IDENTITY"#],
        );
        check(
            r#"[id=te\1F47Bst]"#,
            vec![r#"[tag=,op=*|attr=id,value=te👻st,op=EQ] IDENTITY"#],
        );
        check(
            r#"[id=te\01F47Bst]"#,
            vec![r#"[tag=,op=*|attr=id,value=te👻st,op=EQ] IDENTITY"#],
        );
        check(
            r#"[id=te\1F47B st]"#,
            vec![r#"[tag=,op=*|attr=id,value=te👻st,op=EQ] IDENTITY"#],
        );

        // Escapes in double-quoted attribute values.
        check(
            r#"[id="te\'st"]"#,
            vec![r#"[tag=,op=*|attr=id,value=te'st,op=EQ] IDENTITY"#],
        );
        check(
            r#"[id="te\1F47Bst"]"#,
            vec![r#"[tag=,op=*|attr=id,value=te👻st,op=EQ] IDENTITY"#],
        );
        check(
            r#"[id="te\01F47Bst"]"#,
            vec![r#"[tag=,op=*|attr=id,value=te👻st,op=EQ] IDENTITY"#],
        );
        check(
            r#"[id="te\01F47BAAst"]"#,
            vec![r#"[tag=,op=*|attr=id,value=te👻AAst,op=EQ] IDENTITY"#],
        );
        check(
            r#"[id="te\1F47B st"]"#,
            vec![r#"[tag=,op=*|attr=id,value=te👻st,op=EQ] IDENTITY"#],
        );

        // Escapes in single-quoted attribute values.
        check(
            r#"[id='te\'st']"#,
            vec![r#"[tag=,op=*|attr=id,value=te'st,op=EQ] IDENTITY"#],
        );
        check(
            r#"[id='te\1F47Bst']"#,
            vec![r#"[tag=,op=*|attr=id,value=te👻st,op=EQ] IDENTITY"#],
        );
        check(
            r#"[id='te\01F47Bst']"#,
            vec![r#"[tag=,op=*|attr=id,value=te👻st,op=EQ] IDENTITY"#],
        );
        check(
            r#"[id='te\1F47B st']"#,
            vec![r#"[tag=,op=*|attr=id,value=te👻st,op=EQ] IDENTITY"#],
        );

        // Escapes in tag names, class names and ids.
        check(r#"ya\'ll"#, vec![r#"[tag=ya'll,op=EQ] IDENTITY"#]);
        check(r#"ya\1F44Bll"#, vec![r#"[tag=ya👋ll,op=EQ] IDENTITY"#]);
        check(
            r#".ya\'ll"#,
            vec![r#"[tag=,op=*|attr=class,value=ya'll,op=EQ] IDENTITY"#],
        );
        check(
            r#".ya\1F44Bll"#,
            vec![r#"[tag=,op=*|attr=class,value=ya👋ll,op=EQ] IDENTITY"#],
        );
        check(
            r#"#ya\'ll"#,
            vec![r#"[tag=,op=*|attr=id,value=ya'll,op=EQ] IDENTITY"#],
        );
        check(
            r#"#ya\1F44Bll"#,
            vec![r#"[tag=,op=*|attr=id,value=ya👋ll,op=EQ] IDENTITY"#],
        );

        // Malformed selectors.
        check_err("p +", ParserError::TrailingCombinator);
        check_err("+ p", ParserError::MissingSelector);
        check_err(",", ParserError::MissingSelector);
        check_err("[a!=b]", ParserError::UnrecognizedAttributeSelector);
        check_err("[a=!b]", ParserError::UnexpectedAttributeValue);
        check_err("[a=b ]", ParserError::UnexpectedEndOfAttributeSelector);
        check_err("[a=b", ParserError::UnexpectedEndOfAttributeSelector);
        check_err("[a='b]", ParserError::UnterminatedString);
        check_err("[a='b\n']", ParserError::UnescapedNewlineInString);
        check_err("di\\", ParserError::MissingCombinator);
    }
}