skyscraper 0.1.0

XPath for HTML web scraping
Documentation
pub mod parse;
mod tokenizer;

use std::error::Error;

use indextree::NodeId;
use crate::html::{HtmlDocument, HtmlNode, HtmlTag};

pub use crate::xpath::parse::parse;

#[derive(Debug, PartialEq)]
pub struct XpathQuery {
    pub identifier: String,
    pub predicates: Vec<XpathPredicate>
}

#[derive(Debug, PartialEq)]
pub enum XpathPredicate {
    GreaterThan { attribute: String, value: u64 },
    LessThan { attribute: String, value: u64 },
    Equals { attribute: String, value: String },
    And(Box<XpathPredicate>, Box<XpathPredicate>),
    Or(Box<XpathPredicate>, Box<XpathPredicate>),
}

impl XpathQuery {
    pub fn new(identifier: String) -> XpathQuery {
        XpathQuery {
            identifier,
            predicates: Vec::new(),
        }
    }
}

#[derive(Debug, PartialEq)]
pub enum XpathElement {
    SearchRoot,
    SearchAll,
    Query(XpathQuery)
}

pub struct Xpath {
    elements: Vec<XpathElement>
}

impl Xpath {
    pub fn apply(&self, document: &HtmlDocument) -> Result<Vec<NodeId>, Box<dyn Error>> {
        let elements = &mut self.elements.iter();
        let mut matched_nodes: Vec<NodeId> = Vec::new();
        let mut found_nodes: Vec<NodeId> = vec![document.root_key];

        while let Some(element) = elements.next() {
            match element {
                XpathElement::SearchRoot => {
                    if let Some(XpathElement::Query(query)) = elements.next() {
                        matched_nodes = search_root(query, document, &found_nodes);

                        found_nodes = Vec::new();
                        for node_id in &matched_nodes {
                            let mut children = node_id.children(&document.arena).collect();
                            found_nodes.append(&mut children);
                        }
                    }
                },
                XpathElement::SearchAll => {
                    if let Some(XpathElement::Query(query)) = elements.next() {
                        matched_nodes = search_all(query, document, &found_nodes);

                        found_nodes = Vec::new();
                        for node_id in &matched_nodes {
                            let mut children = node_id.children(&document.arena).collect();
                            found_nodes.append(&mut children);
                        }
                    }
                },
                XpathElement::Query(_) => todo!(),
            }
        }

        Ok(matched_nodes)
    }
}

pub fn search_root(query: &XpathQuery, document: &HtmlDocument, nodes: &Vec<NodeId>) -> Vec<NodeId> {
    search_internal(false, query, document, nodes)
}

pub fn search_all(query: &XpathQuery, document: &HtmlDocument, nodes: &Vec<NodeId>) -> Vec<NodeId> {
    search_internal(true, query, document, nodes)
}

fn search_internal(recursive: bool, query: &XpathQuery, document: &HtmlDocument, nodes: &Vec<NodeId>) -> Vec<NodeId> {
    let mut matches = Vec::new();
    
    for node_id in nodes.iter() {
        if let Some(node) = document.arena.get(*node_id) {
            match node.get() {
                HtmlNode::Tag(rtag) => {
                    if rtag.name == query.identifier && is_matching_predicates(query, rtag) {
                        matches.push(*node_id);
                    }

                    if recursive {
                        let children = node_id.children(&document.arena).collect();
                        let mut sub_matches = search_all(query, document, &children);
                        matches.append(&mut sub_matches);
                    }
                },
                HtmlNode::Text(_) => continue,
            }
        }
    }
    
    return matches;
}

fn is_matching_predicates(query: &XpathQuery, rtag: &HtmlTag) -> bool {
    for p in &query.predicates {
        match p {
            XpathPredicate::GreaterThan { .. } => todo!(),
            XpathPredicate::LessThan { .. } => todo!(),
            XpathPredicate::Equals { attribute, value } => {
                if !rtag.attributes.contains_key(attribute) || &rtag.attributes[attribute] != value {
                    return false;
                }
            },
            XpathPredicate::And(_, _) => todo!(),
            XpathPredicate::Or(_, _) => todo!(),
        }
    }

    return true;
}

#[cfg(test)]
mod test {
    use crate::{xpath, html};

    use super::*;

    #[test]
    fn search_root_works() {
        let text =r###"<!DOCTYPE html>
        <root>
            <a/>
            <b><a/></b>
            <c><b><a/></b></c>
        </root>
        "###;

        let document = html::parse(text).unwrap();

        let query = XpathQuery::new(String::from("root"));
        let result = search_root(&query, &document, &vec![document.root_key]);

        assert_eq!(1, result.len());
        let node = document.arena.get(result[0]).unwrap();

        match node.get() {
            HtmlNode::Tag(tag) => {
                assert_eq!("root", tag.name.as_str());
            },
            HtmlNode::Text(_) => panic!("Expected tag"),
        }
    }

    #[test]
    fn search_all_works() {
        let text =r###"<!DOCTYPE html>
        <root>
            <a/>
            <b><a/></b>
            <c><b><a/></b></c>
        </root>
        "###;

        let document = html::parse(text).unwrap();

        let query = XpathQuery::new(String::from("a"));
        let result = search_all(&query, &document, &vec![document.root_key]);

        assert_eq!(3, result.len());

        for r in result {
            let node = document.arena.get(r).unwrap();

            match node.get() {
                HtmlNode::Tag(tag) => {
                    assert_eq!("a", tag.name.as_str());
                },
                HtmlNode::Text(_) => panic!("Expected tag"),
            }
        }
    }

    #[test]
    fn search_all_should_only_select_with_matching_attribute_predicate() {
        let text =r###"<!DOCTYPE html>
        <root>
            <a/>
            <b><a/></b>
            <c><b><a hello="world"/></b></c>
        </root>
        "###;

        let document = html::parse(text).unwrap();

        let query = XpathQuery {
            identifier: String::from("a"),
            predicates: vec![
                XpathPredicate::Equals { attribute: String::from("hello"), value: String::from("world") },
            ]
        };

        let result = search_all(&query, &document, &vec![document.root_key]);

        assert_eq!(1, result.len());

        let node = document.arena.get(result[0]).unwrap();

        match node.get() {
            HtmlNode::Tag(tag) => {
                assert_eq!("a", tag.name.as_str());
                assert!(tag.attributes["hello"] == String::from("world"));
            },
            HtmlNode::Text(_) => panic!("Expected tag"),
        }
    }

    #[test]
    fn search_all_should_only_select_with_multiple_matching_attribute_predicate() {
        let text =r###"<!DOCTYPE html>
        <root>
            <a/>
            <b><a hello="world"/></b>
            <c><b><a hello="world" foo="bar"/></b></c>
        </root>
        "###;

        let document = html::parse(text).unwrap();

        let query = XpathQuery {
            identifier: String::from("a"),
            predicates: vec![
                XpathPredicate::Equals { attribute: String::from("hello"), value: String::from("world") },
                XpathPredicate::Equals { attribute: String::from("foo"), value: String::from("bar") },
            ]
        };
        
        let result = search_all(&query, &document, &vec![document.root_key]);

        assert_eq!(1, result.len());

        let node = document.arena.get(result[0]).unwrap();

        match node.get() {
            HtmlNode::Tag(tag) => {
                assert_eq!("a", tag.name.as_str());
                assert!(tag.attributes["hello"] == String::from("world"));
            },
            HtmlNode::Text(_) => panic!("Expected tag"),
        }
    }

    #[test]
    fn xpath_apply_works() {
        let text = r###"<!DOCTYPE html>
        <root>
            <node></node>
        </root>
        "###;

        let document = html::parse(text).unwrap();

        let xpath = xpath::parse::parse("/root/node").unwrap();

        let nodes = xpath.apply(&document).unwrap();

        let node = document.arena.get(nodes[0]).unwrap().get();

        match node {
            HtmlNode::Tag(t) => assert_eq!("node", t.name),
            HtmlNode::Text(_) => panic!("expected tag, got text instead"),
        }
    }
}