// rust-pickaxe 0.5.5
//
// HTML data extraction library
// Documentation
use std::sync::LazyLock;
use regex::Regex;


/// Verbose-mode regular expression that recognises one XPath step per match.
///
/// Each match is exactly one of four alternatives, tried in this order:
///
/// * `id(...)` anchored at the start of the haystack (group `idvalue`);
/// * an attribute accessor such as `/@src` (groups `attrnav`, `attraccessor`);
/// * a text accessor, `/text()` or `//text()` (groups `textnav`, `textaccessor`);
/// * a tag step such as `//div[@class='x'][1]` (groups `tagnav` and `tag`, then
///   optionally either `matched`/`mattr`/`mvalue` or `contained`/`cattr`/`cvalue`,
///   then optionally `nth`).
///
/// The positional predicate accepts a run of digits, so `div[12]` is recognised as
/// well as `div[1]`.
///
/// # Panics
///
/// The first access panics if the pattern fails to compile.
// NOTE(review): `parse_xpath` in this file reads this static, so the `dead_code`
// allowance looks redundant — confirm before removing it.
#[allow(dead_code)]
static XPATH_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"(?x)                                   # Enable verbose mode
    (?P<node>                                               # Start of 'node' named capture group
        (                                                   # Start of main alternation
            ^id\(["']?                                      # Literal 'id(' anchored at the start, then an optional quote
                (?P<idvalue>                                # Start of 'idvalue' named capture group
                    \s*[\w/:]                               # Optional whitespace followed by a word character, '/', or ':'
                    [-/\w\s,:;.+]*                          # Match allowed characters
                )                                           # End of 'idvalue' group
            ["']?\)                                         # Optional closing quote, then a literal ')'
        |                                                   # OR
            (?P<attrnav>//?)                                # 'attrnav' named capture group: '/' or '//'
            (?P<attraccessor>                               # 'attraccessor' named capture group
                @\w+[-_\w]*                                 # Literal '@' followed by a word character and allowed characters
            )                                               # End of 'attraccessor' group
        |                                                   # OR
            (?P<textnav>//?)                                # 'textnav' named capture group: '/' or '//'
            (?P<textaccessor>                               # 'textaccessor' named capture group
                text\(\)                                    # Literal 'text()'
            )                                               # End of 'textaccessor' group
        |                                                   # OR
            (?P<tagnav>//?)                                 # 'tagnav' named capture group: '/' or '//'
            (?P<tag>                                        # Start of 'tag' named capture group
                ([a-zA-Z][a-zA-Z0-9]{0,10}|\*)              # Match a tag with 1-11 alphanumerics or '*'
            )                                               # End of 'tag' group
            (                                               # Start of optional attributes
                \[                                          # Literal '['
                    (                                       # Start of attribute alternation
                        (?P<matched>                        # 'matched' named capture group
                            (?P<mattr>@?[.a-zA-Z_:]         # 'mattr' named capture group
                                [-\w:.]*(\(\))?             # Match allowed characters and optional '()'
                            )                               # End of 'mattr' group
                            =["']                           # Equal sign followed by a quote
                            (?P<mvalue>                     # 'mvalue' named capture group
                                \s*[\w/:]                   # Optional whitespace and word character, '/', or ':'
                                [-/\w\s,:;.+]*               # Match allowed characters
                            )                               # End of 'mvalue' group
                            ["']                            # Closing quote
                        )                                   # End of 'matched' group
                    |                                       # OR
                        (?P<contained>                      # 'contained' named capture group
                            contains\(                      # Literal 'contains('
                                (?P<cattr>@?[.a-zA-Z_:]     # 'cattr' named capture group
                                    [-\w:.]*(\(\))?         # Match allowed characters and optional '()'
                                ),                          # Comma
                                \s*["']                     # Optional whitespace, and quote
                                (?P<cvalue>                 # 'cvalue' named capture group
                                    \s*[\w/:]               # Optional whitespace and word character, '/', or ':'
                                    [-/\w\s,:;.+]*           # Match allowed characters
                                )                           # End of 'cvalue' group
                                ["']                        # Closing quote
                            \)                              # Literal ')'
                        )                                   # End of 'contained' group
                    )                                       # End of attribute alternation
                \]                                          # Literal ']'
            )?                                              # End of optional attributes
            (\[(?P<nth>\d+|last\(\)|first\(\))\])?          # Optional '[N]', '[last()]' or '[first()]' positional predicate
        )                                                   # End of main alternation
    )"#)
    .expect("Invalid regex pattern")
});

/// The type of node accessor requested in the XPath expression.
///
/// Returned by [`parse_xpath`] next to the CSS selector; it describes what the
/// trailing step of the expression asked for.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NodeAccessor {
    /// The expression ended in an attribute step such as `/@src`; holds the
    /// attribute name with the leading `@` removed.
    Attribute(String),
    /// The expression ended in `text()`; `recursive` is `true` when that step was
    /// introduced by `//` rather than `/`.
    Text { recursive: bool },
    /// The expression had no trailing attribute or text accessor.
    Node,
}

/// Convert an XPath expression to a CSS selector.
///
/// Only the XPath subset recognised by `XPATH_PATTERN` is handled: tag steps with an
/// optional `[@attr='value']`, `[contains(@attr, 'value')]` or
/// `[contains(text(), 'value')]` predicate and an optional positional predicate, a
/// leading `id(...)`, and a trailing `@attr` or `text()` accessor.
///
/// * `xpath` - The XPath expression to convert. Surrounding whitespace is ignored.
///   An expression that does not start with `/`, `.` or `id(` is treated as if it
///   were prefixed with `./`.
///
/// Returns `Some((selector, accessor))`: `selector` is the CSS selector built from
/// the steps before the accessor (empty when there are none) and `accessor` says
/// whether an attribute, text, or the node itself was requested.
///
/// Returns `None` when no further step can be recognised before the end of the
/// expression.
pub fn parse_xpath<S: AsRef<str>>(xpath: S) -> Option<(String, NodeAccessor)> {
    let xpath = xpath.as_ref().trim();

    // Bare expressions such as `div`, `@src` or `text()` get a `./` prefix so that
    // every step starts with a navigation token. `id(...)` must stay untouched: the
    // pattern only recognises it at the very start of the input, so prefixing it
    // would make that branch unreachable.
    let prefixed;
    let normalized_xpath =
        if xpath.starts_with('/') || xpath.starts_with('.') || xpath.starts_with("id(") {
            xpath
        } else {
            prefixed = format!("./{xpath}");
            prefixed.as_str()
        };

    let mut selector = String::new();
    let mut position = 0;
    let mut accessor = NodeAccessor::Node;

    while position < normalized_xpath.len() {
        // The search is unanchored, so characters the pattern does not recognise
        // before the next step are skipped rather than rejected.
        let caps = XPATH_PATTERN.captures(&normalized_xpath[position..])?;
        // Group 0 is the whole match; its end is relative to the slice searched.
        let step_end = caps.get(0)?.end();

        // An attribute or text accessor terminates the expression.
        if let Some(attr_match) = caps.name("attraccessor") {
            accessor =
                NodeAccessor::Attribute(attr_match.as_str().trim_start_matches('@').to_string());
            break;
        }

        if caps.name("textaccessor").is_some() {
            accessor = NodeAccessor::Text {
                recursive: caps.name("textnav")?.as_str() == "//",
            };
            break;
        }

        // NOTE(review): XPath `//` is the descendant axis while CSS ` > ` is the
        // child combinator, so this mapping looks inverted. It is kept as-is because
        // the tests in this file pin it — confirm the intent before changing.
        let nav = if position == 0 {
            ""
        } else if caps.name("tagnav")?.as_str() == "//" {
            " > "
        } else {
            " "
        };

        // `*` matches any element, which in CSS is expressed by omitting the tag.
        let tag = match caps.name("tag").map_or("", |m| m.as_str()) {
            "*" => "",
            name => name,
        };

        let attr = if let Some(id) = caps.name("idvalue") {
            format!("#{}", id.as_str().replace(' ', "#"))
        } else if caps.name("matched").is_some() {
            let mattr = caps.name("mattr")?.as_str();
            let mvalue = caps.name("mvalue")?.as_str();

            match mattr {
                "@id" => format!("#{}", mvalue.replace(' ', "#")),
                "@class" => format!(".{}", mvalue.replace(' ', ".")),
                "text()" | "." => format!(":contains(^{}$)", mvalue),
                _ if !mattr.is_empty() => format!(
                    "[{}=\"{}\"]",
                    mattr.trim_start_matches('@'),
                    mvalue.replace('"', "\\\"")
                ),
                _ => String::new(),
            }
        } else if caps.name("contained").is_some() {
            let cattr = caps.name("cattr")?.as_str();
            let cvalue = caps.name("cvalue")?.as_str();

            if let Some(name) = cattr.strip_prefix('@') {
                // NOTE(review): the value is emitted unquoted, which the tests in
                // this file pin; values containing spaces, `/` or `:` may not form a
                // valid CSS attribute selector — confirm with the selector engine.
                format!("[{}*={}]", name, cvalue)
            } else if cattr == "text()" {
                format!(":contains({})", cvalue)
            } else {
                String::new()
            }
        } else {
            String::new()
        };

        let nth = match caps.name("nth").map(|m| m.as_str()) {
            Some("last()") => ":last-of-type".to_string(),
            Some("first()") => ":first-of-type".to_string(),
            Some(idx) if idx.chars().all(|c| c.is_ascii_digit()) => {
                format!(":nth-of-type({})", idx)
            }
            _ => String::new(),
        };

        selector.push_str(nav);
        selector.push_str(tag);
        selector.push_str(&attr);
        selector.push_str(&nth);
        position += step_end;
    }

    Some((selector, accessor))
}


#[cfg(test)]
mod tests {
    use super::*;

    /// Expected successful parse: the selector text plus the requested accessor.
    fn hit(selector: &str, accessor: NodeAccessor) -> Option<(String, NodeAccessor)> {
        Some((selector.to_string(), accessor))
    }

    /// Expected accessor for a trailing `@name` step.
    fn attribute(name: &str) -> NodeAccessor {
        NodeAccessor::Attribute(name.to_string())
    }

    /// Expected accessor for a trailing `text()` step.
    fn text(recursive: bool) -> NodeAccessor {
        NodeAccessor::Text { recursive }
    }

    /// Runs every (expression, expected result) pair through `parse_xpath`.
    #[test]
    fn test_parse_xpath() {
        let cases = [
            // Attribute equality predicates, with either quote style.
            (
                "//meta[@property='og:description']",
                hit("meta[property=\"og:description\"]", NodeAccessor::Node),
            ),
            (
                r#"//meta[@property="og:description"]"#,
                hit("meta[property=\"og:description\"]", NodeAccessor::Node),
            ),
            // Positional predicates.
            (
                "/html/body/div[1]",
                hit("html body div:nth-of-type(1)", NodeAccessor::Node),
            ),
            (
                "/html/body/div[last()]",
                hit("html body div:last-of-type", NodeAccessor::Node),
            ),
            (
                "/html/body/div[first()]",
                hit("html body div:first-of-type", NodeAccessor::Node),
            ),
            // `@id`, `@class` and `contains(...)` shorthands.
            ("//div[@id='main']", hit("div#main", NodeAccessor::Node)),
            (
                "//div[contains(@class, 'content')]",
                hit("div[class*=content]", NodeAccessor::Node),
            ),
            (
                "//div[contains(text(), 'Hello')]",
                hit("div:contains(Hello)", NodeAccessor::Node),
            ),
            (
                "//div[@class='content']//div//li[@id='name']",
                hit("div.content > div > li#name", NodeAccessor::Node),
            ),
            // Trailing attribute accessors.
            ("//div[@id='main']//@src", hit("div#main", attribute("src"))),
            (
                "//span[@class='name']/@data-name",
                hit("span.name", attribute("data-name")),
            ),
            // Trailing text accessors.
            ("/html/text()", hit("html", text(false))),
            ("//html//div//text()", hit("html > div", text(true))),
            (
                "//div[contains(text(), 'Hello')]//text()",
                hit("div:contains(Hello)", text(true)),
            ),
            // Input that is not an XPath expression at all.
            ("Not an XPath expression", None),
            // Relative expressions, with and without an explicit `./`.
            ("./text()", hit("", text(false))),
            (".//text()", hit("", text(true))),
            (".//div[@id='main']", hit("div#main", NodeAccessor::Node)),
            ("./@src", hit("", attribute("src"))),
            ("@src", hit("", attribute("src"))),
            ("text()", hit("", text(false))),
            // Attribute values containing `/` and `+`.
            (
                r#"//script[@type="application/ld+json"]"#,
                hit("script[type=\"application/ld+json\"]", NodeAccessor::Node),
            ),
        ];

        for (xpath, expected) in cases {
            assert_eq!(parse_xpath(xpath), expected, "xpath: {}", xpath);
        }
    }
}