use std::sync::LazyLock;
use regex::Regex;
/// Lazily compiled, verbose-mode regex that recognises one XPath step at a time.
///
/// Each match fills exactly one of the following groups of named captures:
///
/// * `idvalue` — the argument of a leading `id(...)` call;
/// * `attrnav` / `attraccessor` — a trailing attribute access such as `/@href`;
/// * `textnav` / `textaccessor` — a trailing `text()` access;
/// * `tagnav` / `tag` — an element step (`/div`, `//*`), optionally followed by
///   one predicate (`matched` with `mattr`/`mvalue` for `[attr='value']`, or
///   `contained` with `cattr`/`cvalue` for `[contains(attr, 'value')]`) and an
///   optional positional predicate captured as `nth`.
///
/// The pattern is not anchored (apart from the `id(` alternative), so callers
/// must check where a match starts if they need contiguous parsing.
#[allow(dead_code)]
static XPATH_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"(?x) # Enable verbose mode
        (?P<node> # Start of 'node' named capture group
        ( # Start of main alternation
        ^id\(["']? # Literal 'id(' anchored at the start, then an optional opening quote
        (?P<idvalue> # Start of 'idvalue' named capture group
        \s*[\w/:] # Optional whitespace followed by a word character, '/', or ':'
        [-/\w\s,:;.+]* # Match allowed characters
        ) # End of 'idvalue' group
        ["']?\) # Optional closing quote and parenthesis
        | # OR
        (?P<attrnav>//?) # 'attrnav' named capture group: '/' or '//'
        (?P<attraccessor> # 'attraccessor' named capture group
        @\w+[-_\w]* # Literal '@' followed by a word character and allowed characters
        ) # End of 'attraccessor' group
        | # OR
        (?P<textnav>//?) # 'textnav' named capture group: '/' or '//'
        (?P<textaccessor> # 'textaccessor' named capture group
        text\(\) # Literal 'text()'
        ) # End of 'textaccessor' group
        | # OR
        (?P<tagnav>//?) # 'tagnav' named capture group: '/' or '//'
        (?P<tag> # Start of 'tag' named capture group
        ([a-zA-Z][a-zA-Z0-9]{0,10}|\*) # Match a tag with 1-11 alphanumerics or '*'
        ) # End of 'tag' group
        ( # Start of optional attributes
        \[ # Literal '['
        ( # Start of attribute alternation
        (?P<matched> # 'matched' named capture group
        (?P<mattr>@?[.a-zA-Z_:] # 'mattr' named capture group
        [-\w:.]*(\(\))? # Match allowed characters and optional '()'
        ) # End of 'mattr' group
        =["'] # Equal sign followed by a quote
        (?P<mvalue> # 'mvalue' named capture group
        \s*[\w/:] # Optional whitespace and word character, '/', or ':'
        [-/\w\s,:;.+]* # Match allowed characters
        ) # End of 'mvalue' group
        ["'] # Closing quote
        ) # End of 'matched' group
        | # OR
        (?P<contained> # 'contained' named capture group
        contains\( # Literal 'contains('
        (?P<cattr>@?[.a-zA-Z_:] # 'cattr' named capture group
        [-\w:.]*(\(\))? # Match allowed characters and optional '()'
        ), # Comma
        \s*["'] # Optional whitespace, and quote
        (?P<cvalue> # 'cvalue' named capture group
        \s*[\w/:] # Optional whitespace and word character, '/', or ':'
        [-/\w\s,:;.+]* # Match allowed characters
        ) # End of 'cvalue' group
        ["'] # Closing quote
        \) # Literal ')'
        ) # End of 'contained' group
        ) # End of attribute alternation
        \] # Literal ']'
        )? # End of optional attributes
        (\[(?P<nth>\d+|last\(\)|first\(\))\])? # Optional '[<digits>|last()|first()]' named capture group
        ) # End of main alternation
        )"#)
    // The pattern is a compile-time constant, so a failure here is a programming error.
    .expect("Invalid regex pattern")
});
/// Describes what should be read from the nodes selected by a translated XPath.
///
/// Produced by [`parse_xpath`] alongside the CSS selector string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NodeAccessor {
    /// Read the named attribute; the name is stored without the leading `@`
    /// (e.g. `/@data-name` yields `Attribute("data-name")`).
    Attribute(String),
    /// Read text content. `recursive` is `true` when the XPath used `//text()`
    /// and `false` when it used `/text()`.
    Text {
        /// Whether the `text()` step was introduced with `//` rather than `/`.
        recursive: bool,
    },
    /// No attribute or text step was present; the selected node itself is the result.
    Node,
}
/// Translates a simple XPath expression into a CSS selector and a [`NodeAccessor`].
///
/// The input is trimmed and then consumed one step at a time using
/// `XPATH_PATTERN`. Element steps are appended to the selector; a trailing
/// `@attr` or `text()` step ends parsing and is reported through the returned
/// accessor instead of the selector.
///
/// Inputs that do not start with `/`, `.` or `id(` are treated as relative and
/// parsed as if prefixed with `./`.
///
/// # Returns
///
/// * `Some((selector, accessor))` when every step up to the end of the input
///   (or up to the first attribute/text accessor) was recognised. The selector
///   may be empty, e.g. for `./@src`.
/// * `None` when any part of the expression is not recognised. Unsupported
///   text between two steps is rejected rather than silently skipped, so
///   `//tr[td]/td` yields `None` instead of the selector for `//tr/td`.
pub fn parse_xpath<S: AsRef<str>>(xpath: S) -> Option<(String, NodeAccessor)> {
    let xpath = xpath.as_ref().trim();
    let reg = &*XPATH_PATTERN;

    // Relative expressions get a "./" prefix so the first step has a navigation
    // token. `id(...)` is left alone: its pattern is anchored to the start of
    // the input and would never match behind a prefix.
    let prefixed;
    let normalized_xpath: &str =
        if xpath.starts_with('/') || xpath.starts_with('.') || xpath.starts_with("id(") {
            xpath
        } else {
            prefixed = format!("./{xpath}");
            &prefixed
        };

    let mut selector = String::new();
    let mut accessor = NodeAccessor::Node;
    let mut position = 0;

    while position < normalized_xpath.len() {
        let rest = &normalized_xpath[position..];
        let caps = reg.captures(rest)?;
        // Group 0 is the whole match; reusing it avoids a second regex search.
        let step = caps.get(0)?;

        // The pattern is unanchored, so make sure nothing unsupported was
        // skipped. Only the leading "." of a relative path may precede a match.
        let skipped = &rest[..step.start()];
        if !(skipped.is_empty() || (position == 0 && skipped == ".")) {
            return None;
        }

        if let Some(attr_match) = caps.name("attraccessor") {
            accessor =
                NodeAccessor::Attribute(attr_match.as_str().trim_start_matches('@').to_string());
            break;
        }
        if caps.name("textaccessor").is_some() {
            accessor = NodeAccessor::Text {
                recursive: caps.name("textnav")?.as_str() == "//",
            };
            break;
        }

        // NOTE(review): in standard XPath `/` is the child axis and `//` the
        // descendant axis, so this mapping looks inverted; the existing tests
        // pin the current output, so confirm with callers before changing it.
        let nav = if position == 0 {
            ""
        } else if caps.name("tagnav")?.as_str() == "//" {
            " > "
        } else {
            " "
        };

        // `*` matches any element, which CSS expresses by omitting the tag.
        let tag = match caps.name("tag").map_or("", |m| m.as_str()) {
            "*" => "",
            name => name,
        };

        let attr = if let Some(id) = caps.name("idvalue") {
            format!("#{}", id.as_str().replace(' ', "#"))
        } else if caps.name("matched").is_some() {
            let mattr = caps.name("mattr")?.as_str();
            let mvalue = caps.name("mvalue")?.as_str();
            match mattr {
                "@id" => format!("#{}", mvalue.replace(' ', "#")),
                "@class" => format!(".{}", mvalue.replace(' ', ".")),
                "text()" | "." => format!(":contains(^{}$)", mvalue),
                // The pattern only allows '@' as the first character of `mattr`.
                _ => format!(
                    "[{}=\"{}\"]",
                    mattr.strip_prefix('@').unwrap_or(mattr),
                    mvalue.replace('"', "\\\"")
                ),
            }
        } else if caps.name("contained").is_some() {
            let cattr = caps.name("cattr")?.as_str();
            let cvalue = caps.name("cvalue")?.as_str();
            if let Some(name) = cattr.strip_prefix('@') {
                // The value is emitted unquoted, matching the existing expected output.
                format!("[{}*={}]", name, cvalue)
            } else if cattr == "text()" {
                format!(":contains({})", cvalue)
            } else {
                String::new()
            }
        } else {
            String::new()
        };

        let nth = match caps.name("nth").map(|n| n.as_str()) {
            Some("last()") => ":last-of-type".to_string(),
            Some("first()") => ":first-of-type".to_string(),
            Some(idx) if idx.chars().all(|c| c.is_ascii_digit()) => {
                format!(":nth-of-type({})", idx)
            }
            _ => String::new(),
        };

        selector.push_str(nav);
        selector.push_str(tag);
        selector.push_str(&attr);
        selector.push_str(&nth);

        // `step.end()` is relative to `rest`, so it advances the absolute cursor.
        position += step.end();
    }

    Some((selector, accessor))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an attribute accessor from a string slice.
    fn attribute(name: &str) -> NodeAccessor {
        NodeAccessor::Attribute(name.to_string())
    }

    /// Builds a text accessor with the given recursion flag.
    fn text(recursive: bool) -> NodeAccessor {
        NodeAccessor::Text { recursive }
    }

    /// Runs `parse_xpath` over a table of inputs and checks each result.
    #[test]
    fn test_parse_xpath() {
        // (input XPath, expected selector and accessor; `None` when parsing must fail)
        let cases: Vec<(&str, Option<(&str, NodeAccessor)>)> = vec![
            (
                "//meta[@property='og:description']",
                Some(("meta[property=\"og:description\"]", NodeAccessor::Node)),
            ),
            (
                r#"//meta[@property="og:description"]"#,
                Some(("meta[property=\"og:description\"]", NodeAccessor::Node)),
            ),
            (
                "/html/body/div[1]",
                Some(("html body div:nth-of-type(1)", NodeAccessor::Node)),
            ),
            (
                "/html/body/div[last()]",
                Some(("html body div:last-of-type", NodeAccessor::Node)),
            ),
            (
                "/html/body/div[first()]",
                Some(("html body div:first-of-type", NodeAccessor::Node)),
            ),
            ("//div[@id='main']", Some(("div#main", NodeAccessor::Node))),
            (
                "//div[contains(@class, 'content')]",
                Some(("div[class*=content]", NodeAccessor::Node)),
            ),
            (
                "//div[contains(text(), 'Hello')]",
                Some(("div:contains(Hello)", NodeAccessor::Node)),
            ),
            (
                "//div[@class='content']//div//li[@id='name']",
                Some(("div.content > div > li#name", NodeAccessor::Node)),
            ),
            ("//div[@id='main']//@src", Some(("div#main", attribute("src")))),
            (
                "//span[@class='name']/@data-name",
                Some(("span.name", attribute("data-name"))),
            ),
            ("/html/text()", Some(("html", text(false)))),
            ("//html//div//text()", Some(("html > div", text(true)))),
            (
                "//div[contains(text(), 'Hello')]//text()",
                Some(("div:contains(Hello)", text(true))),
            ),
            ("Not an XPath expression", None),
            ("./text()", Some(("", text(false)))),
            (".//text()", Some(("", text(true)))),
            (".//div[@id='main']", Some(("div#main", NodeAccessor::Node))),
            ("./@src", Some(("", attribute("src")))),
            ("@src", Some(("", attribute("src")))),
            ("text()", Some(("", text(false)))),
            (
                r#"//script[@type="application/ld+json"]"#,
                Some(("script[type=\"application/ld+json\"]", NodeAccessor::Node)),
            ),
        ];

        for (input, expected) in cases {
            let expected = expected.map(|(selector, accessor)| (selector.to_string(), accessor));
            assert_eq!(parse_xpath(input), expected, "unexpected result for {input:?}");
        }
    }
}