parse_book_source/analyzer/
html.rs

1use super::Analyzer;
2use anyhow::anyhow;
3use regex::Regex;
4use scraper::{Html, Selector};
5
/// Decodes a small, fixed set of HTML entities in `s` and turns a literal
/// `<br/>` into a newline.
///
/// Handled sequences: `&lt;`, `&gt;`, `&nbsp;`, `&#39;`, `&quot;`, `<br/>`
/// and `&amp;`. Anything else is left untouched.
///
/// `&amp;` is decoded last on purpose: decoding it first would turn an
/// escaped entity such as `&amp;lt;` into `&lt;` and then into `<`, i.e. the
/// text would be decoded twice.
fn html_decode(s: &str) -> String {
    // Order matters: `<br/>` comes after `&lt;`/`&gt;` so an escaped line
    // break is also converted, and `&amp;` comes last (see above).
    const REPLACEMENTS: [(&str, &str); 7] = [
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&nbsp;", " "),
        ("&#39;", "'"),
        ("&quot;", "\""),
        ("<br/>", "\n"),
        ("&amp;", "&"),
    ];

    REPLACEMENTS
        .iter()
        .fold(s.to_string(), |acc, (from, to)| acc.replace(from, to))
}
16
17fn get_html_string(html: &str) -> String {
18    let re_tags = Regex::new(r"</?(?:div|p|br|hr|h\d|article|b|dd|dl|html)[^>]*>").unwrap();
19    let re_comments = Regex::new(r"<!--[\w\W\r\n]*?-->").unwrap();
20    let mut result = re_tags.replace_all(html, "\n").to_string();
21    result = re_comments.replace_all(&result, "").to_string();
22    html_decode(&result)
23}
24
/// Analyzer that evaluates CSS-selector based rules against an HTML string.
#[derive(Debug, Clone)]
pub struct HtmlAnalyzer {
    /// Raw HTML text; it is re-parsed on each query.
    content: String,
}
28
29impl Analyzer for HtmlAnalyzer {
30    fn parse(content: &str) -> crate::Result<Self>
31    where
32        Self: Sized,
33    {
34        Ok(Self {
35            content: content.to_string(),
36        })
37    }
38
39    fn get_elements(&self, rule: &str) -> crate::Result<Vec<String>> {
40        let document = Html::parse_document(&self.content);
41        let selector = Selector::parse(rule.trim()).map_err(|e| anyhow!("{e}"))?;
42
43        Ok(document.select(&selector).map(|el| el.html()).collect())
44    }
45
46    fn get_string(&self, rule: &str) -> crate::Result<String> {
47        Ok(self.get_string_list(rule)?.join("  "))
48    }
49
50    fn get_string_list(&self, rule: &str) -> crate::Result<Vec<String>> {
51        if !rule.contains('@') {
52            return Ok(vec![self._get_result(rule, None)]);
53        }
54
55        let (selectors, last_rule) = rule.split_once('@').unwrap();
56        let document = Html::parse_document(&self.content);
57
58        if selectors.is_empty() {
59            return Ok(vec![]);
60        }
61        let selector = Selector::parse(selectors).expect("Invalid selector");
62
63        Ok(document
64            .select(&selector)
65            .map(|el| self._get_result(last_rule, Some(el.html().as_str())))
66            .collect())
67    }
68}
69
70impl HtmlAnalyzer {
71    fn _get_result(&self, last_rule: &str, html: Option<&str>) -> String {
72        let document = Html::parse_fragment(html.unwrap_or(&self.content));
73
74        match last_rule {
75            "text" => document.root_element().text().collect::<String>(),
76            "textNodes" => {
77                let selector = Selector::parse(":root > *").unwrap();
78                document
79                    .select(&selector)
80                    .map(|el| el.text().collect::<String>())
81                    .collect::<Vec<String>>()
82                    .join("\n")
83                    .trim()
84                    .to_string()
85            }
86            "outerHtml" => document.html(),
87            "innerHtml" => {
88                let selector = Selector::parse(":root").unwrap();
89                document
90                    .select(&selector)
91                    .map(|el| el.inner_html())
92                    .collect::<Vec<String>>()
93                    .join("\n")
94                    .trim()
95                    .to_string()
96            }
97            "html" => get_html_string(document.html().as_str()),
98            _ => document
99                .root_element()
100                .child_elements()
101                .next()
102                .unwrap()
103                .attr(last_rule)
104                .unwrap_or("")
105                .to_string(),
106        }
107    }
108}