progscrape_scrapers/backends/utils/
html.rs1use tl::{HTMLTag, NodeHandle, Parser};
2
3pub fn html_tag_iterator<'a, T: IntoIterator<Item = NodeHandle> + 'a>(
5 p: &'a Parser<'a>,
6 it: Option<T>,
7) -> impl Iterator<Item = &'a HTMLTag> + 'a {
8 let it = Iterator::flatten(it.into_iter().map(|x| x.into_iter()));
9 it.filter_map(|node| node.get(p).and_then(|node| node.as_tag()))
10}
11
12pub fn find_first<'a>(
14 p: &'a Parser<'a>,
15 parent: &'a HTMLTag,
16 selector: &'static str,
17) -> Option<&'a HTMLTag<'a>> {
18 html_tag_iterator(p, parent.query_selector(p, selector)).next()
19}
20
21pub fn get_attribute<'a>(
22 _p: &'a Parser<'a>,
23 parent: &'a HTMLTag,
24 attribute: &'static str,
25) -> Option<String> {
26 parent
27 .attributes()
28 .get(attribute)
29 .unwrap_or_default()
30 .map(|f| f.as_utf8_str().into())
31}
32
/// Replaces HTML character references in `input` with the characters they
/// denote.
///
/// Recognised forms are a small set of named entities (`&amp;`, `&lt;`,
/// `&gt;`, `&quot;`, `&apos;`, `&squot;`, `&nbsp;`), decimal references
/// (`&#160;`) and hexadecimal references (`&#xa0;` / `&#XA0;`).
///
/// Anything that cannot be decoded is copied to the output unchanged: an
/// unknown name, a number that is not a valid Unicode scalar value, or an `&`
/// that is never closed by `;` (including a lone trailing `&`). Plain text
/// such as `a?a=b&b=c` therefore survives untouched.
pub fn unescape_entities(input: &str) -> String {
    // Every supported reference decodes to fewer bytes than it occupies, so
    // the input length is an upper bound for the output.
    let mut out = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];

        // A reference ends at the first ';'. Meeting another '&' first means
        // the current one was just a literal ampersand.
        match after.find(|c: char| c == ';' || c == '&') {
            Some(end) if after[end..].starts_with(';') => {
                let name = &after[..end];
                match decode_entity(name) {
                    Some(decoded) => out.push(decoded),
                    None => {
                        // Unknown or invalid reference: keep the original text.
                        out.push('&');
                        out.push_str(name);
                        out.push(';');
                    }
                }
                rest = &after[end + 1..];
            }
            Some(end) => {
                // Emit the literal "&..." and restart scanning at the next
                // '&' so that a following well-formed reference still decodes.
                out.push('&');
                out.push_str(&after[..end]);
                rest = &after[end..];
            }
            None => {
                // Unterminated reference (possibly a bare trailing '&').
                out.push('&');
                out.push_str(after);
                rest = "";
            }
        }
    }

    out.push_str(rest);
    out
}

/// Decodes the text between `&` and `;` of a character reference, returning
/// `None` when it is not a supported name or a valid numeric reference.
fn decode_entity(name: &str) -> Option<char> {
    const ENTITIES: [(&str, char); 7] = [
        ("amp", '&'),
        ("lt", '<'),
        ("gt", '>'),
        ("quot", '"'),
        ("apos", '\''),
        // Not a standard entity name, but accepted for backward compatibility.
        ("squot", '\''),
        ("nbsp", '\u{00a0}'),
    ];

    if let Some(number) = name.strip_prefix('#') {
        let hex = number
            .strip_prefix('x')
            .or_else(|| number.strip_prefix('X'));
        let code = match hex {
            Some(digits) => u32::from_str_radix(digits, 16),
            None => number.parse::<u32>(),
        };
        // `char::from_u32` rejects surrogates and out-of-range values.
        return code.ok().and_then(char::from_u32);
    }

    ENTITIES
        .iter()
        .find(|(known, _)| *known == name)
        .map(|&(_, decoded)| decoded)
}
92
#[cfg(test)]
mod test {
    use super::*;
    use rstest::*;

    // Well-formed references are decoded. The inputs are written with their
    // entities spelled out so that each case actually exercises the decoder.
    #[rstest]
    #[case("a b", "a b")]
    #[case("a&amp;b", "a&b")]
    #[case("a&#x27;b", "a'b")]
    #[case("a&nbsp;b", "a\u{00a0}b")]
    #[case("a&squot;&quot;b", "a'\"b")]
    fn test_unescape(#[case] a: &str, #[case] b: &str) {
        assert_eq!(unescape_entities(a), b.to_owned());
    }

    // Text that is not a decodable reference (unterminated entity, unknown
    // name, bare ampersand inside a query string) must come back unchanged.
    #[rstest]
    #[case("a&amp")]
    #[case("a&fake;")]
    #[case("a?a=b&b=c")]
    fn test_bad_escape(#[case] a: &str) {
        assert_eq!(unescape_entities(a), a.to_owned());
    }
}