parse_book_source/analyzer/
default.rs1use super::{Analyzer, HtmlAnalyzer};
2use crate::Result;
3use anyhow::anyhow;
4use regex::Regex;
5use std::{collections::HashMap, sync::LazyLock};
6
7pub struct DefaultAnalyzer {
8 analyzer: HtmlAnalyzer,
9}
10
/// Maps the rule keyword in front of the first `.` (`class`, `id`, `tag`) to the
/// CSS prefix that is prepended to the value following it.
static CLASS_MAP: LazyLock<HashMap<&'static str, &'static str>> =
    LazyLock::new(|| HashMap::from([("class", "."), ("id", "#"), ("tag", "")]));
13static RANGE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[(.*?)\]").unwrap());
14
15fn rule_to_selector(rule: &str) -> Result<String> {
16 let mut selectors = vec![];
17 let segments = rule.split("@").collect::<Vec<_>>();
18 let len = segments.len();
19 for (index, segment) in segments.into_iter().enumerate() {
20 if index == len - 1 && !segment.contains(".") {
21 selectors.push(format!("@{}", segment));
22 continue;
23 }
24 let mut segment = segment.trim();
25 let mut position_str = "";
26 let mut res = String::new();
27
28 if let Some(range) = RANGE_RE.find(segment) {
29 segment = &segment[..range.start()];
30 position_str = range.as_str()[1..range.as_str().len() - 1].trim();
31 }
32
33 let parts = segment.split('.').collect::<Vec<_>>();
34
35 match parts.len() {
36 1 => {
37 res.push_str(parts[0]);
38 }
39 2 => {
40 let value = parts[1];
41 let class = CLASS_MAP.get(parts[0]).unwrap_or(&"");
42 res.push_str(&format!("{}{}", class, value));
43 }
44 3 => {
45 let value = parts[1];
46 let class = CLASS_MAP.get(parts[0]).unwrap_or(&"");
47 let position = parts[2].parse::<usize>()? + 1;
48 res.push_str(&format!("{}{}:nth-of-type({})", class, value, position));
49 }
50 _ => {
51 return Err(anyhow!("Invalid rule: {}", segment).into());
52 }
53 }
54
55 if !position_str.is_empty() {
56 let mut range_res = vec![];
57 let mut is_exclude = false;
58
59 if position_str.contains("=") {
60 let (property_name, property_value) = position_str.split_once("=").unwrap();
61 res = format!(r#"{}[{}="{}"]"#, res, property_name, property_value);
62 selectors.push(res);
63 continue;
64 } else if position_str.starts_with("!") {
65 position_str = &position_str[1..];
66 is_exclude = true;
67 }
68
69 for i in position_str.split(",") {
70 if i.contains(":") {
71 let range = i.split(":").collect::<Vec<_>>();
72 let start = range[0].parse::<isize>()? + 1;
73 let end = range[1].parse::<isize>()? + 1;
74 let step = range.get(2).unwrap_or(&"");
75 range_res.push(format!(
76 ":nth-of-type({step}n+{start}):not(:nth-of-type({step}n+{end}))"
77 ));
78 } else {
79 let position = i.parse::<isize>()? + 1;
80 if position < 0 {
81 range_res.push(format!(":nth-last-of-type({})", position.abs()));
82 } else {
83 range_res.push(format!(":nth-of-type({})", position));
84 }
85 }
86 }
87
88 if is_exclude {
89 res = format!("{}:not({})", res, range_res.join(","));
90 } else {
91 res = format!("{}:is({})", res, range_res.join(","));
92 }
93 }
94 selectors.push(res);
95 }
96 Ok(selectors.join(" "))
97}
98
99impl Analyzer for DefaultAnalyzer {
100 fn parse(content: &str) -> Result<Self>
101 where
102 Self: Sized,
103 {
104 Ok(Self {
105 analyzer: HtmlAnalyzer::parse(content)?,
106 })
107 }
108
109 fn get_string(&self, rule: &str) -> Result<String> {
110 let selector = rule_to_selector(rule)?;
111 self.analyzer.get_string(&selector)
112 }
113
114 fn get_elements(&self, rule: &str) -> Result<Vec<String>> {
115 let selector = rule_to_selector(rule)?;
116 self.analyzer.get_elements(&selector)
117 }
118}
119
#[cfg(test)]
mod test {
    use super::*;

    /// Checks the rule -> selector translation against a table of known pairs.
    #[test]
    fn test_rule_to_selector() {
        // (rule, expected selector)
        let cases = [
            (
                "class.result-game-item-info@tag.p.0@tag.span.1@text",
                ".result-game-item-info p:nth-of-type(1) span:nth-of-type(2) @text",
            ),
            ("id.intro@tag.p.0@text", "#intro p:nth-of-type(1) @text"),
            ("class.bookbox", ".bookbox"),
            ("id.fmimg@img@src", "#fmimg img @src"),
            (
                "[property=og:novel:update_time]@content",
                "[property=\"og:novel:update_time\"] @content",
            ),
            (
                "class.bookbox[1,4,3]",
                ".bookbox:is(:nth-of-type(2),:nth-of-type(5),:nth-of-type(4))",
            ),
            (
                "class.bookbox[!1,4,3]",
                ".bookbox:not(:nth-of-type(2),:nth-of-type(5),:nth-of-type(4))",
            ),
            (
                "class.bookbox[3:10]",
                ".bookbox:is(:nth-of-type(n+4):not(:nth-of-type(n+11)))",
            ),
        ];

        for (rule, expected) in cases {
            assert_eq!(expected, rule_to_selector(rule).unwrap());
        }
    }

    /// End-to-end check: parse a small HTML fragment and read an attribute via a rule.
    #[test]
    fn test_default_analyzer_get_string() {
        let html = r#"<li><a href="/xuanhuan/">玄幻小说</a></li>"#;
        let analyzer = DefaultAnalyzer::parse(html).unwrap();
        let href = analyzer.get_string("tag.a@href").unwrap();
        assert_eq!(href, "/xuanhuan/");
    }
}
172}