html_query_extractor/
lib.rs

1use html_query_ast::Action;
2use html_query_ast::Expression;
3use markup5ever::{LocalName, Namespace, QualName};
4use scraper::{ElementRef, Html};
5use serde_json::{Map, Value};
6use std::collections::HashMap;
7
8use thiserror::Error;
9
10#[derive(Error, Debug)]
11pub enum ExpressionError {
12    #[error("Selector `{0} returned no results")]
13    EmptySelector(String),
14
15    #[error("Unexpected empty root node")]
16    EmptyRoot,
17}
18
/// Returns a copy of `input` with leading and trailing whitespace removed.
///
/// Whitespace in the interior of the string is left untouched.
fn trim_whitespace(input: String) -> String {
    let trimmed: &str = input.trim();
    trimmed.to_owned()
}
22
23fn handle_expression(
24    roots: &[ElementRef],
25    rhs: &Expression,
26    lhs: &Option<Box<Action>>,
27) -> Result<Value, ExpressionError> {
28    return match rhs {
29        Expression::Selector(selector, original_selector) => {
30            let first_root = roots.first().ok_or(ExpressionError::EmptyRoot)?;
31            let new_roots: Vec<_> = first_root.select(selector).collect();
32            let first_new_root = new_roots
33                .first()
34                .ok_or_else(|| ExpressionError::EmptySelector(original_selector.clone()))?;
35            match lhs {
36                None => Ok(Value::String(trim_whitespace(
37                    first_new_root.text().collect(),
38                ))),
39                Some(lhs) => Ok(convert_to_output(lhs, &new_roots)),
40            }
41        }
42        Expression::Attribute(attr) => {
43            let first_root = roots.first().ok_or(ExpressionError::EmptyRoot)?;
44            Ok(first_root
45                .value()
46                .attrs
47                .get(&QualName::new(
48                    None,
49                    Namespace::from(""),
50                    LocalName::from(attr.as_str()),
51                ))
52                .map_or(Value::Null, |v| {
53                    Value::String(trim_whitespace(v.to_string()))
54                }))
55        }
56        Expression::Text => {
57            let first_root = roots.first().ok_or(ExpressionError::EmptyRoot)?;
58            Ok(Value::String(trim_whitespace(first_root.text().collect())))
59        }
60        Expression::Parent => {
61            let first_root = roots.first().ok_or(ExpressionError::EmptyRoot)?;
62            let parent_root = ElementRef::wrap(first_root.parent().unwrap()).unwrap();
63            match lhs {
64                None => handle_expression(&[parent_root], &Expression::Text, &None),
65                Some(lhs) => Ok(convert_to_output(lhs, &vec![parent_root])),
66            }
67        }
68        Expression::Sibling(idx) => {
69            let first_root = roots.first().ok_or(ExpressionError::EmptyRoot)?;
70            let mut next_sibling_elements = first_root
71                .next_siblings()
72                .filter(|s| s.value().is_element());
73            let chosen_sibling =
74                ElementRef::wrap(next_sibling_elements.nth(*idx - 1).unwrap()).unwrap();
75            match lhs {
76                None => handle_expression(&[chosen_sibling], &Expression::Text, &None),
77                Some(lhs) => Ok(convert_to_output(lhs, &vec![chosen_sibling])),
78            }
79        }
80    };
81}
82
83fn convert_to_output(item: &Action, roots: &Vec<ElementRef>) -> Value {
84    return match item {
85        Action::ForEachChild(hashmap) => Value::Array(
86            roots
87                .iter()
88                .map(|root| {
89                    let map = hashmap
90                        .iter()
91                        .map(|(key, value)| (key.clone(), convert_to_output(value, &vec![*root])))
92                        .collect::<Map<_, _>>();
93                    Value::Object(map)
94                })
95                .collect(),
96        ),
97        Action::ForEachChildArray(action) => Value::Array(
98            roots
99                .iter()
100                .map(|root| convert_to_output(action, &vec![*root]))
101                .collect(),
102        ),
103        Action::Child(hashmap) => {
104            let map = hashmap
105                .iter()
106                .map(|(key, value)| (key.clone(), convert_to_output(value, roots)))
107                .collect::<Map<_, _>>();
108            Value::Object(map)
109        }
110        Action::Expression(rhs, lhs) => handle_expression(roots, rhs, lhs).unwrap_or(Value::Null),
111    };
112}
113
114pub fn extract(input: &str, actions: &HashMap<&str, Action>) -> Value {
115    let fragment = Html::parse_fragment(input);
116    let root = fragment.root_element();
117    let hashmap = actions
118        .iter()
119        .map(|(key, value)| (key.to_string(), convert_to_output(value, &vec![root])))
120        .collect();
121    Value::Object(hashmap)
122}