progscrape_scrapers/backends/utils/
html.rs

1use tl::{HTMLTag, NodeHandle, Parser};
2
3/// Takes an Option<QuerySelectorIterator> and makes it return a stream of nodes.
4pub fn html_tag_iterator<'a, T: IntoIterator<Item = NodeHandle> + 'a>(
5    p: &'a Parser<'a>,
6    it: Option<T>,
7) -> impl Iterator<Item = &'a HTMLTag> + 'a {
8    let it = Iterator::flatten(it.into_iter().map(|x| x.into_iter()));
9    it.filter_map(|node| node.get(p).and_then(|node| node.as_tag()))
10}
11
12/// Find the first child node matching the selector.
13pub fn find_first<'a>(
14    p: &'a Parser<'a>,
15    parent: &'a HTMLTag,
16    selector: &'static str,
17) -> Option<&'a HTMLTag<'a>> {
18    html_tag_iterator(p, parent.query_selector(p, selector)).next()
19}
20
21pub fn get_attribute<'a>(
22    _p: &'a Parser<'a>,
23    parent: &'a HTMLTag,
24    attribute: &'static str,
25) -> Option<String> {
26    parent
27        .attributes()
28        .get(attribute)
29        .unwrap_or_default()
30        .map(|f| f.as_utf8_str().into())
31}
32
/// Unescapes a common subset of HTML character references in `input`.
///
/// Recognised forms:
/// - named: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;`, plus the non-standard
///   `&squot;` alias,
/// - decimal: `&#160;`,
/// - hexadecimal: `&#xA0;` / `&#XA0;`.
///
/// Anything that is not a recognised, `;`-terminated reference is passed through unchanged.
/// That includes a bare `&` (even as the last character of the input) and a literal `&` that
/// appears before a later, valid reference: `"Q&A &amp; more"` becomes `"Q&A & more"`.
pub fn unescape_entities(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut rest = input;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp + 1..];
        // A reference runs up to the first ';'. Meeting another '&' first means the current
        // '&' was a literal one and must not swallow the text that follows it.
        let decoded = tail
            .find(|c: char| c == ';' || c == '&')
            .filter(|&end| tail[end..].starts_with(';'))
            .and_then(|end| decode_entity(&tail[..end]).map(|ch| (ch, end)));
        match decoded {
            Some((ch, end)) => {
                out.push(ch);
                rest = &tail[end + 1..];
            }
            None => {
                // Not a reference: keep the '&' and keep scanning right after it.
                out.push('&');
                rest = tail;
            }
        }
    }
    out.push_str(rest);
    out
}

/// Decodes the text between `&` and `;` of a character reference, or returns `None` when it is
/// not a supported name or a valid numeric code point.
fn decode_entity(name: &str) -> Option<char> {
    const NAMED: [(&str, char); 7] = [
        ("amp", '&'),
        ("lt", '<'),
        ("gt", '>'),
        ("quot", '"'),
        ("apos", '\''),
        ("squot", '\''),
        ("nbsp", '\u{00a0}'),
    ];
    if let Some(body) = name.strip_prefix('#') {
        let (digits, radix) = match body.strip_prefix(|c: char| c == 'x' || c == 'X') {
            Some(hex) => (hex, 16),
            None => (body, 10),
        };
        // `from_str_radix` tolerates a leading '+', which is not valid in a reference.
        if !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        return u32::from_str_radix(digits, radix)
            .ok()
            .and_then(char::from_u32);
    }
    NAMED
        .iter()
        .find(|(known, _)| *known == name)
        .map(|&(_, ch)| ch)
}
92
#[cfg(test)]
mod test {
    use super::*;
    use rstest::*;

    /// Inputs containing supported entities decode to the expected text.
    #[rstest]
    #[case("a b", "a b")]
    #[case("a&amp;b", "a&b")]
    #[case("a&#x27;b", "a'b")]
    #[case("a&#160;b", "a\u{00a0}b")]
    #[case("a&squot;&quot;b", "a'\"b")]
    fn test_unescape(#[case] input: &str, #[case] expected: &str) {
        let actual = unescape_entities(input);
        assert_eq!(actual, expected);
    }

    /// Unterminated or unknown entities come back exactly as they went in.
    #[rstest]
    #[case("a&amp")]
    #[case("a&fake;")]
    #[case("a?a=b&b=c")]
    fn test_bad_escape(#[case] input: &str) {
        let actual = unescape_entities(input);
        assert_eq!(actual, input);
    }
}