html_query_extractor/
lib.rs1use html_query_ast::Action;
2use html_query_ast::Expression;
3use markup5ever::{LocalName, Namespace, QualName};
4use scraper::{ElementRef, Html};
5use serde_json::{Map, Value};
6use std::collections::HashMap;
7
8use thiserror::Error;
9
10#[derive(Error, Debug)]
11pub enum ExpressionError {
12 #[error("Selector `{0} returned no results")]
13 EmptySelector(String),
14
15 #[error("Unexpected empty root node")]
16 EmptyRoot,
17}
18
/// Strips leading and trailing whitespace from an owned string.
///
/// The caller gives up ownership of `input`, so when there is nothing to
/// strip the very same buffer is handed back; a new `String` is allocated
/// only when the trimmed text is actually shorter than the input.
fn trim_whitespace(input: String) -> String {
    let trimmed = input.trim();
    if trimmed.len() == input.len() {
        // `trim` only ever removes bytes, so equal lengths mean the input was
        // already trimmed and can be returned without copying.
        input
    } else {
        trimmed.to_string()
    }
}
22
23fn handle_expression(
24 roots: &[ElementRef],
25 rhs: &Expression,
26 lhs: &Option<Box<Action>>,
27) -> Result<Value, ExpressionError> {
28 return match rhs {
29 Expression::Selector(selector, original_selector) => {
30 let first_root = roots.first().ok_or(ExpressionError::EmptyRoot)?;
31 let new_roots: Vec<_> = first_root.select(selector).collect();
32 let first_new_root = new_roots
33 .first()
34 .ok_or_else(|| ExpressionError::EmptySelector(original_selector.clone()))?;
35 match lhs {
36 None => Ok(Value::String(trim_whitespace(
37 first_new_root.text().collect(),
38 ))),
39 Some(lhs) => Ok(convert_to_output(lhs, &new_roots)),
40 }
41 }
42 Expression::Attribute(attr) => {
43 let first_root = roots.first().ok_or(ExpressionError::EmptyRoot)?;
44 Ok(first_root
45 .value()
46 .attrs
47 .get(&QualName::new(
48 None,
49 Namespace::from(""),
50 LocalName::from(attr.as_str()),
51 ))
52 .map_or(Value::Null, |v| {
53 Value::String(trim_whitespace(v.to_string()))
54 }))
55 }
56 Expression::Text => {
57 let first_root = roots.first().ok_or(ExpressionError::EmptyRoot)?;
58 Ok(Value::String(trim_whitespace(first_root.text().collect())))
59 }
60 Expression::Parent => {
61 let first_root = roots.first().ok_or(ExpressionError::EmptyRoot)?;
62 let parent_root = ElementRef::wrap(first_root.parent().unwrap()).unwrap();
63 match lhs {
64 None => handle_expression(&[parent_root], &Expression::Text, &None),
65 Some(lhs) => Ok(convert_to_output(lhs, &vec![parent_root])),
66 }
67 }
68 Expression::Sibling(idx) => {
69 let first_root = roots.first().ok_or(ExpressionError::EmptyRoot)?;
70 let mut next_sibling_elements = first_root
71 .next_siblings()
72 .filter(|s| s.value().is_element());
73 let chosen_sibling =
74 ElementRef::wrap(next_sibling_elements.nth(*idx - 1).unwrap()).unwrap();
75 match lhs {
76 None => handle_expression(&[chosen_sibling], &Expression::Text, &None),
77 Some(lhs) => Ok(convert_to_output(lhs, &vec![chosen_sibling])),
78 }
79 }
80 };
81}
82
83fn convert_to_output(item: &Action, roots: &Vec<ElementRef>) -> Value {
84 return match item {
85 Action::ForEachChild(hashmap) => Value::Array(
86 roots
87 .iter()
88 .map(|root| {
89 let map = hashmap
90 .iter()
91 .map(|(key, value)| (key.clone(), convert_to_output(value, &vec![*root])))
92 .collect::<Map<_, _>>();
93 Value::Object(map)
94 })
95 .collect(),
96 ),
97 Action::ForEachChildArray(action) => Value::Array(
98 roots
99 .iter()
100 .map(|root| convert_to_output(action, &vec![*root]))
101 .collect(),
102 ),
103 Action::Child(hashmap) => {
104 let map = hashmap
105 .iter()
106 .map(|(key, value)| (key.clone(), convert_to_output(value, roots)))
107 .collect::<Map<_, _>>();
108 Value::Object(map)
109 }
110 Action::Expression(rhs, lhs) => handle_expression(roots, rhs, lhs).unwrap_or(Value::Null),
111 };
112}
113
114pub fn extract(input: &str, actions: &HashMap<&str, Action>) -> Value {
115 let fragment = Html::parse_fragment(input);
116 let root = fragment.root_element();
117 let hashmap = actions
118 .iter()
119 .map(|(key, value)| (key.to_string(), convert_to_output(value, &vec![root])))
120 .collect();
121 Value::Object(hashmap)
122}