parse_book_source/analyzer/
analyzer_manager.rs

1use std::{collections::HashMap, sync::LazyLock};
2
3use super::{Analyzer, AnalyzerType, Analyzers, SingleRule, json::value_to_string};
4use crate::{Result, utils::replace_all};
5use anyhow::anyhow;
6use regex::Regex;
7use serde_json::Value;
8
9static SPLIT_RULE: LazyLock<Regex> = LazyLock::new(|| {
10    Regex::new(
11        r"@css:|@json:|@http:|@xpath:|@match:|@regex:|@regexp:|@replace:|@encode:|@decode:|^",
12    )
13    .unwrap()
14});
15static EXPRESSION: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{\{(.+?)\}\}").unwrap());
16static PUT_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"@put:\{(.+?):(.+?)\}").unwrap());
17static GET_RULE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"@get:\{(.+?)\}").unwrap());
18
/// Resolves textual extraction rules against input data.
///
/// Holds the list of available analyzers plus a key/value store that rules can write to
/// (`@put:{key:rule}`) and read from (`@get:{key}`).
#[derive(Debug, Clone)]
pub struct AnalyzerManager {
    /// Analyzer descriptors, tried in order by `get_analyzer`; the first whose pattern
    /// matches the rule is used.
    pub analyzers: Vec<Analyzers>,
    /// Variables stored via `set` or `@put:` and substituted for `@get:{key}`.
    pub variables: HashMap<String, String>,
}
24
25impl AnalyzerManager {
26    pub fn new() -> Result<Self> {
27        Ok(Self {
28            analyzers: vec![
29                Analyzers::new(r"^@css:", None, AnalyzerType::Html)?,
30                Analyzers::new(r"^@json:|^\$", Some(r"^@json:"), AnalyzerType::JsonPath)?,
31                Analyzers::new("", None, AnalyzerType::Default)?,
32            ],
33            variables: HashMap::new(),
34        })
35    }
36
37    pub fn set<T: ToString>(&mut self, key: &str, value: T) {
38        self.variables.insert(key.to_string(), value.to_string());
39    }
40
41    pub fn get_analyzer(&self, rule: &str) -> &Analyzers {
42        self.analyzers
43            .iter()
44            .find(|a| a.pattern.is_match(rule.trim()))
45            .unwrap()
46    }
47
48    pub fn split_rule_resolve(&self, rule: &str) -> Result<Vec<SingleRule>> {
49        let rule_match = SPLIT_RULE.find_iter(rule).collect::<Vec<_>>();
50        let mut rule_list: Vec<SingleRule> = vec![];
51        let mut end = rule.len();
52
53        for i in rule_match.iter().rev() {
54            let mut r = rule[i.start()..end].to_string();
55            end = i.start();
56
57            let analyzer = self.get_analyzer(&r);
58
59            r = analyzer
60                .replace
61                .as_ref()
62                .unwrap_or(&analyzer.pattern)
63                .replace(&r, "")
64                .to_string();
65
66            if let Some(index) = r.find("##") {
67                // 按## 分割
68                let (r, replace) = r.split_at(index);
69
70                rule_list.push(SingleRule::new(
71                    r,
72                    // 去掉 ##
73                    Some(&replace[2..]),
74                    analyzer.analyzer.clone(),
75                )?);
76            } else {
77                rule_list.push(SingleRule::new(&r, None, analyzer.analyzer.clone())?);
78            }
79        }
80
81        rule_list.reverse();
82        Ok(rule_list)
83    }
84
85    fn _get_elements(analyzer: &dyn Analyzer, rule: &str) -> Result<Vec<String>> {
86        if rule.contains("&&") {
87            let mut res = vec![];
88            for simple_rule in rule.split("&&") {
89                let mut r = Self::_get_elements(analyzer, simple_rule)?;
90
91                if !r.is_empty() {
92                    res.append(&mut r);
93                }
94            }
95            return Ok(res);
96        } else if rule.contains("||") {
97            for simple_rule in rule.split("||") {
98                let r = Self::_get_elements(analyzer, simple_rule)?;
99
100                if !r.is_empty() {
101                    return Ok(r);
102                };
103            }
104        }
105        analyzer.get_elements(rule)
106    }
107
108    pub fn get_element(&self, rule: &str, data: &str) -> Result<Vec<String>> {
109        let mut temp = data.to_string();
110
111        for single_rule in self.split_rule_resolve(rule)? {
112            let analyzer = single_rule.analyzer.parse_to_analyzer(&temp)?;
113            temp = Self::_get_elements(analyzer.as_ref(), &single_rule.rule)?
114                .join("_______split_______");
115        }
116
117        Ok(temp
118            .split("_______split_______")
119            .map(|s| s.to_string())
120            .collect())
121    }
122
123    fn _get_string(
124        single_rule: &SingleRule,
125        analyzer: &dyn Analyzer,
126        rule: &str,
127    ) -> Result<String> {
128        let mut result = String::new();
129
130        if rule.contains("&&") {
131            let mut res = vec![];
132            for simple_rule in rule.split("&&") {
133                let r = Self::_get_string(single_rule, analyzer, simple_rule)?;
134
135                if !r.is_empty() {
136                    res.push(r);
137                }
138            }
139
140            return Ok(res.join("  "));
141        } else if rule.contains("||") {
142            for simple_rule in rule.split("||") {
143                let r = Self::_get_string(single_rule, analyzer, simple_rule)?;
144
145                if !r.is_empty() {
146                    return Ok(r);
147                };
148            }
149        } else {
150            result = analyzer.get_string(rule)?.trim().to_string()
151        }
152
153        if result.is_empty() {
154            Ok(result)
155        } else {
156            Ok(single_rule.replace_content(&result)?)
157        }
158    }
159
160    fn put_variable(&mut self, rule: &str, data: &str) -> Result<String> {
161        replace_all(&PUT_RULE, rule, |capture| {
162            let key = capture
163                .get(1)
164                .ok_or(anyhow!("key is not found"))?
165                .as_str()
166                .trim();
167
168            let sub_rule = capture
169                .get(2)
170                .ok_or(anyhow!("value rule is not found"))?
171                .as_str()
172                .trim();
173
174            let v = self.get_string(sub_rule, data, None)?;
175            self.variables.insert(key.to_string(), v);
176            Ok("".into())
177        })
178    }
179
180    fn get_variable(&self, rule: &str) -> Result<String> {
181        replace_all(&GET_RULE, rule, |capture| {
182            let key = capture
183                .get(1)
184                .ok_or(anyhow!("key is not found"))?
185                .as_str()
186                .trim();
187
188            let v = self
189                .variables
190                .get(key)
191                .ok_or(anyhow!("the value of key {} is not found", key))?;
192
193            Ok(v.to_string())
194        })
195    }
196
197    pub fn get_string(&mut self, rule: &str, data: &str, extra: Option<Value>) -> Result<String> {
198        if rule.is_empty() {
199            return Ok("".to_string());
200        }
201
202        // 处理put
203        let new_rule = self.put_variable(rule, data)?;
204
205        // 处理get
206        let new_rule = self.get_variable(&new_rule)?;
207
208        // 处理表达式
209        let p_left = new_rule.rfind("{{");
210        let p_right = new_rule.rfind("}}");
211
212        if let Some(left) = p_left
213            && let Some(right) = p_right
214            && left < right
215        {
216            return replace_all(&EXPRESSION, &new_rule, |captures| {
217                let sub_rule = captures.get(1).map(|m| m.as_str().trim()).unwrap_or("");
218                if extra.is_some()
219                    && let Some(extra_value) = extra.as_ref().unwrap().get(sub_rule)
220                {
221                    return value_to_string(extra_value);
222                }
223                self.get_string(sub_rule, data, None)
224            });
225        }
226
227        // 处理普通规则
228        let mut temp = data.to_string();
229        for single_rule in self.split_rule_resolve(&new_rule)? {
230            let analyzer = single_rule.analyzer.parse_to_analyzer(&temp)?;
231
232            temp = Self::_get_string(&single_rule, analyzer.as_ref(), &single_rule.rule)?;
233            temp = single_rule.replace_content(&temp)?;
234        }
235        Ok(temp)
236    }
237}
238
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// A URL template containing both `@get:{...}` and `{{...}}` is fully expanded.
    #[test]
    fn test_analyzer_manager() {
        let data = "{\"buymessagevalue\":\"15_15\",\"chapter_id\":300,\"chapter_name\":\"第四卷 剑气近_第二百九十五章 远望\",\"chapter_size\":3253,\"coin\":15,\"coin_original\":15,\"createdate\":\"2023-04-27 23:19:25\",\"license\":1,\"money\":0.15,\"novel_bkid_crid\":\"novel_672340121_300\",\"ori_license\":1,\"txt_url\":\"\",\"zip_url\":\"\"}";
        let rule = "https://www.xmkanshu.com/service/getContent?fr=smsstg&v=4&uid=B197589CF54DC527538FADCAE6BDBC78&urbid=%2Fbook_95_0&bkid=@get:{book}&crid={{$.chapter_id}}&pg=1";
        let expected = "https://www.xmkanshu.com/service/getContent?fr=smsstg&v=4&uid=B197589CF54DC527538FADCAE6BDBC78&urbid=%2Fbook_95_0&bkid=123&crid=300&pg=1";

        let mut manager = AnalyzerManager::new().unwrap();
        manager.set("book", 123);
        manager.set("index", 1);

        let resolved = manager.get_string(rule, data, Some(json!({ "page": 123 })));
        assert_eq!(resolved.unwrap(), expected);
    }

    /// Each rule prefix selects the expected analyzer type.
    #[test]
    fn test_analyzer_manager_get_analyzer() {
        let manager = AnalyzerManager::new().unwrap();
        let cases = [
            ("[property=og:novel:latest_chapter_name]@content", AnalyzerType::Default),
            ("$.book.id##4##abc", AnalyzerType::JsonPath),
            ("@css:div h1 a[href]", AnalyzerType::Html),
        ];

        for (rule, expected) in cases {
            assert_eq!(manager.get_analyzer(rule).analyzer, expected);
        }
    }
}
277}