parse_book_source/analyzer/
html.rs1use super::Analyzer;
2use anyhow::anyhow;
3use regex::Regex;
4use scraper::{Html, Selector};
5
/// Decodes the handful of HTML entities this analyzer cares about and turns
/// any `<br/>` still present in the text into a newline.
///
/// Handled entities: `&lt;`, `&gt;`, `&nbsp;` (becomes a plain space),
/// `&#39;` / `&apos;`, `&quot;` and `&amp;`. Everything else is passed
/// through unchanged.
fn html_decode(s: &str) -> String {
    // Applied in order. `&lt;` / `&gt;` come before `<br/>` so an escaped
    // `&lt;br/&gt;` is still turned into a newline. `&amp;` is deliberately
    // last: decoding it first would turn `&amp;lt;` into `&lt;` and then into
    // `<`, i.e. unescape the text twice.
    const REPLACEMENTS: [(&str, &str); 8] = [
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&nbsp;", " "),
        ("&#39;", "'"),
        ("&apos;", "'"),
        ("&quot;", "\""),
        ("<br/>", "\n"),
        ("&amp;", "&"),
    ];

    REPLACEMENTS
        .iter()
        .fold(s.to_string(), |acc, &(from, to)| acc.replace(from, to))
}
16
17fn get_html_string(html: &str) -> String {
18 let re_tags = Regex::new(r"</?(?:div|p|br|hr|h\d|article|b|dd|dl|html)[^>]*>").unwrap();
19 let re_comments = Regex::new(r"<!--[\w\W\r\n]*?-->").unwrap();
20 let mut result = re_tags.replace_all(html, "\n").to_string();
21 result = re_comments.replace_all(&result, "").to_string();
22 html_decode(&result)
23}
24
/// Analyzer for HTML input.
///
/// Only the raw markup is stored; it is kept as an owned `String` so the
/// analyzer does not borrow from the caller's buffer.
#[derive(Debug, Clone)]
pub struct HtmlAnalyzer {
    /// The HTML text this analyzer was created from, unmodified.
    content: String,
}
28
29impl Analyzer for HtmlAnalyzer {
30 fn parse(content: &str) -> crate::Result<Self>
31 where
32 Self: Sized,
33 {
34 Ok(Self {
35 content: content.to_string(),
36 })
37 }
38
39 fn get_elements(&self, rule: &str) -> crate::Result<Vec<String>> {
40 let document = Html::parse_document(&self.content);
41 let selector = Selector::parse(rule.trim()).map_err(|e| anyhow!("{e}"))?;
42
43 Ok(document.select(&selector).map(|el| el.html()).collect())
44 }
45
46 fn get_string(&self, rule: &str) -> crate::Result<String> {
47 Ok(self.get_string_list(rule)?.join(" "))
48 }
49
50 fn get_string_list(&self, rule: &str) -> crate::Result<Vec<String>> {
51 if !rule.contains('@') {
52 return Ok(vec![self._get_result(rule, None)]);
53 }
54
55 let (selectors, last_rule) = rule.split_once('@').unwrap();
56 let document = Html::parse_document(&self.content);
57
58 if selectors.is_empty() {
59 return Ok(vec![]);
60 }
61 let selector = Selector::parse(selectors).expect("Invalid selector");
62
63 Ok(document
64 .select(&selector)
65 .map(|el| self._get_result(last_rule, Some(el.html().as_str())))
66 .collect())
67 }
68}
69
70impl HtmlAnalyzer {
71 fn _get_result(&self, last_rule: &str, html: Option<&str>) -> String {
72 let document = Html::parse_fragment(html.unwrap_or(&self.content));
73
74 match last_rule {
75 "text" => document.root_element().text().collect::<String>(),
76 "textNodes" => {
77 let selector = Selector::parse(":root > *").unwrap();
78 document
79 .select(&selector)
80 .map(|el| el.text().collect::<String>())
81 .collect::<Vec<String>>()
82 .join("\n")
83 .trim()
84 .to_string()
85 }
86 "outerHtml" => document.html(),
87 "innerHtml" => {
88 let selector = Selector::parse(":root").unwrap();
89 document
90 .select(&selector)
91 .map(|el| el.inner_html())
92 .collect::<Vec<String>>()
93 .join("\n")
94 .trim()
95 .to_string()
96 }
97 "html" => get_html_string(document.html().as_str()),
98 _ => document
99 .root_element()
100 .child_elements()
101 .next()
102 .unwrap()
103 .attr(last_rule)
104 .unwrap_or("")
105 .to_string(),
106 }
107 }
108}