parse_book_source/analyzer/
default.rs

1use super::{Analyzer, HtmlAnalyzer};
2use crate::Result;
3use anyhow::anyhow;
4use regex::Regex;
5use std::{collections::HashMap, sync::LazyLock};
6
7pub struct DefaultAnalyzer {
8    analyzer: HtmlAnalyzer,
9}
10
/// Maps the kind keyword of a rule segment (`class`, `id`, `tag`) to the CSS
/// prefix placed in front of the segment's value (`.`, `#`, or nothing).
static CLASS_MAP: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    let prefixes = [("class", "."), ("id", "#"), ("tag", "")];
    HashMap::from(prefixes)
});
13static RANGE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[(.*?)\]").unwrap());
14
15fn rule_to_selector(rule: &str) -> Result<String> {
16    let mut selectors = vec![];
17    let segments = rule.split("@").collect::<Vec<_>>();
18    let len = segments.len();
19    for (index, segment) in segments.into_iter().enumerate() {
20        if index == len - 1 && !segment.contains(".") {
21            selectors.push(format!("@{}", segment));
22            continue;
23        }
24        let mut segment = segment.trim();
25        let mut position_str = "";
26        let mut res = String::new();
27
28        if let Some(range) = RANGE_RE.find(segment) {
29            segment = &segment[..range.start()];
30            position_str = range.as_str()[1..range.as_str().len() - 1].trim();
31        }
32
33        let parts = segment.split('.').collect::<Vec<_>>();
34
35        match parts.len() {
36            1 => {
37                res.push_str(parts[0]);
38            }
39            2 => {
40                let value = parts[1];
41                let class = CLASS_MAP.get(parts[0]).unwrap_or(&"");
42                res.push_str(&format!("{}{}", class, value));
43            }
44            3 => {
45                let value = parts[1];
46                let class = CLASS_MAP.get(parts[0]).unwrap_or(&"");
47                let position = parts[2].parse::<usize>()? + 1;
48                res.push_str(&format!("{}{}:nth-of-type({})", class, value, position));
49            }
50            _ => {
51                return Err(anyhow!("Invalid rule: {}", segment).into());
52            }
53        }
54
55        if !position_str.is_empty() {
56            let mut range_res = vec![];
57            let mut is_exclude = false;
58
59            if position_str.contains("=") {
60                let (property_name, property_value) = position_str.split_once("=").unwrap();
61                res = format!(r#"{}[{}="{}"]"#, res, property_name, property_value);
62                selectors.push(res);
63                continue;
64            } else if position_str.starts_with("!") {
65                position_str = &position_str[1..];
66                is_exclude = true;
67            }
68
69            for i in position_str.split(",") {
70                if i.contains(":") {
71                    let range = i.split(":").collect::<Vec<_>>();
72                    let start = range[0].parse::<isize>()? + 1;
73                    let end = range[1].parse::<isize>()? + 1;
74                    let step = range.get(2).unwrap_or(&"");
75                    range_res.push(format!(
76                        ":nth-of-type({step}n+{start}):not(:nth-of-type({step}n+{end}))"
77                    ));
78                } else {
79                    let position = i.parse::<isize>()? + 1;
80                    if position < 0 {
81                        range_res.push(format!(":nth-last-of-type({})", position.abs()));
82                    } else {
83                        range_res.push(format!(":nth-of-type({})", position));
84                    }
85                }
86            }
87
88            if is_exclude {
89                res = format!("{}:not({})", res, range_res.join(","));
90            } else {
91                res = format!("{}:is({})", res, range_res.join(","));
92            }
93        }
94        selectors.push(res);
95    }
96    Ok(selectors.join(" "))
97}
98
99impl Analyzer for DefaultAnalyzer {
100    fn parse(content: &str) -> Result<Self>
101    where
102        Self: Sized,
103    {
104        Ok(Self {
105            analyzer: HtmlAnalyzer::parse(content)?,
106        })
107    }
108
109    fn get_string(&self, rule: &str) -> Result<String> {
110        let selector = rule_to_selector(rule)?;
111        self.analyzer.get_string(&selector)
112    }
113
114    fn get_elements(&self, rule: &str) -> Result<Vec<String>> {
115        let selector = rule_to_selector(rule)?;
116        self.analyzer.get_elements(&selector)
117    }
118}
119
#[cfg(test)]
mod test {

    use super::*;

    /// Every supported rule form must translate to the expected selector string.
    #[test]
    fn test_rule_to_selector() {
        // (rule, expected selector)
        let cases = [
            (
                "class.result-game-item-info@tag.p.0@tag.span.1@text",
                ".result-game-item-info p:nth-of-type(1) span:nth-of-type(2) @text",
            ),
            ("id.intro@tag.p.0@text", "#intro p:nth-of-type(1) @text"),
            ("class.bookbox", ".bookbox"),
            ("id.fmimg@img@src", "#fmimg img @src"),
            (
                "[property=og:novel:update_time]@content",
                "[property=\"og:novel:update_time\"] @content",
            ),
            (
                "class.bookbox[1,4,3]",
                ".bookbox:is(:nth-of-type(2),:nth-of-type(5),:nth-of-type(4))",
            ),
            (
                "class.bookbox[!1,4,3]",
                ".bookbox:not(:nth-of-type(2),:nth-of-type(5),:nth-of-type(4))",
            ),
            (
                "class.bookbox[3:10]",
                ".bookbox:is(:nth-of-type(n+4):not(:nth-of-type(n+11)))",
            ),
        ];

        for (rule, expected) in cases {
            assert_eq!(expected, rule_to_selector(rule).unwrap());
        }
    }

    /// An attribute rule must return the attribute value from the parsed HTML.
    #[test]
    fn test_default_analyzer_get_string() {
        let html = r#"<li><a href="/xuanhuan/">玄幻小说</a></li>"#;
        let analyzer = DefaultAnalyzer::parse(html).unwrap();
        let res = analyzer.get_string("tag.a@href").unwrap();
        assert_eq!(res, "/xuanhuan/");
    }
}