Skip to main content

parse_book_source/
eval.rs

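//! Rule interpreter (Interpreter + Composite). Recursively walks a [`Rule`] and evaluates it; no string DSL is parsed.
//!
//! Two entry points:
//! - [`eval_value`]: value rule → a single string.
//! - [`eval_list`]: list rule → several "sub-context" content strings (each one is then fed to the item rules).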
6
7use super::backend;
8use super::error::EvalError;
9use super::source::{CleanStep, Rule};
10use super::transform;
11use fancy_regex::Regex;
12use std::collections::HashMap;
13use std::sync::LazyLock;
14
/// Variable table used for template interpolation (`{{key}}` / `{{page}}` / `{{base}}` / named captures).
pub type Vars = HashMap<String, String>;
17
18/// 对当前上下文求一个值。
19pub fn eval_value(rule: &Rule, ctx: &str, vars: &Vars) -> Result<String, EvalError> {
20    match rule {
21        Rule::Literal { literal } => Ok(literal.clone()),
22        Rule::Template { template } => Ok(interpolate(template, vars)),
23        Rule::FirstOf { first_of } => {
24            for r in first_of {
25                let v = eval_value(r, ctx, vars)?;
26                if !v.trim().is_empty() {
27                    return Ok(v);
28                }
29            }
30            Ok(String::new())
31        }
32        Rule::Concat { concat, join } => {
33            let mut parts = Vec::new();
34            for r in concat {
35                let v = eval_value(r, ctx, vars)?;
36                if !v.trim().is_empty() {
37                    parts.push(v);
38                }
39            }
40            Ok(parts.join(join))
41        }
42        Rule::Js { js } => run_js(js, ctx, vars),
43        Rule::Leaf(l) => {
44            let raw = backend::extract(l.via, ctx, l.select.as_deref(), l.index, &l.extract)?;
45            apply_clean(raw, &l.clean, vars)
46        }
47    }
48}
49
50/// 执行一段 JS(逃生舱):以 `result` 为当前上下文、注入变量 + `crypto` 助手。
51/// 未启用 `js` feature 时返回 `Unsupported("js")`(但书源仍可解析)。
52fn run_js(script: &str, result: &str, vars: &Vars) -> Result<String, EvalError> {
53    #[cfg(feature = "js")]
54    {
55        crate::js::eval_js(script, result, vars)
56    }
57    #[cfg(not(feature = "js"))]
58    {
59        let _ = (script, result, vars);
60        Err(EvalError::Unsupported("js"))
61    }
62}
63
64/// 选中所有匹配,返回各自的子上下文内容串。
65pub fn eval_list(rule: &Rule, ctx: &str) -> Result<Vec<String>, EvalError> {
66    match rule {
67        Rule::Leaf(l) => match l.select.as_deref() {
68            Some(sel) => backend::select_all(l.via, ctx, sel),
69            // 无选择器:把当前上下文作为单一项(而非把空串当非法选择器)。
70            None => Ok(vec![ctx.to_string()]),
71        },
72        Rule::FirstOf { first_of } => {
73            for r in first_of {
74                let v = eval_list(r, ctx)?;
75                if !v.is_empty() {
76                    return Ok(v);
77                }
78            }
79            Ok(Vec::new())
80        }
81        // literal/template/concat 作为列表无意义:退化为单值(若非空)。
82        other => {
83            let v = eval_value(other, ctx, &Vars::new())?;
84            Ok(if v.is_empty() { Vec::new() } else { vec![v] })
85        }
86    }
87}
88
89/// 应用清洗流水线。步内固定顺序:
90/// `regex→replace → trim → prepend → append → decode → encode → hash → cipher → fontMap → cn`。
91/// 编解码/加解密会失败(非法输入、错密钥),故返回 `Result`(显式报错,不静默空)。
92fn apply_clean(mut s: String, steps: &[CleanStep], vars: &Vars) -> Result<String, EvalError> {
93    for step in steps {
94        if let Some(pat) = &step.regex {
95            // 非法正则是配置错误,显式报错(与抽取层 regex_extract 及下方 crypto 步一致),不静默跳过。
96            let re = Regex::new(pat).map_err(|e| EvalError::Regex(e.to_string()))?;
97            let rep = step.replace.as_deref().unwrap_or("");
98            s = re.replace_all(&s, rep).into_owned();
99        }
100        if step.trim.unwrap_or(false) {
101            s = s.trim().to_string();
102        }
103        if let Some(p) = &step.prepend {
104            s = format!("{p}{s}");
105        }
106        if let Some(a) = &step.append {
107            s = format!("{s}{a}");
108        }
109        if let Some(c) = step.decode {
110            s = transform::decode(&s, c)?;
111        }
112        if let Some(c) = step.encode {
113            s = transform::encode(&s, c)?;
114        }
115        if let Some(h) = &step.hash {
116            s = transform::hash(&s, h)?;
117        }
118        if let Some(c) = &step.cipher {
119            s = transform::cipher(&s, c)?;
120        }
121        if let Some(table) = &step.font_map {
122            s = transform::font_map(&s, table)?;
123        }
124        if let Some(cn) = step.cn {
125            s = transform::cn_convert(&s, cn);
126        }
127        if let Some(js) = &step.js {
128            s = run_js(js, &s, vars)?;
129        }
130    }
131    Ok(s)
132}
133
134/// 把 `{{key}}` 替换为变量值,未知键替换为空串。
135pub(crate) fn interpolate(template: &str, vars: &Vars) -> String {
136    static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{\{\s*([\w.\-]+)\s*\}\}").unwrap());
137    RE.replace_all(template, |c: &fancy_regex::Captures| {
138        c.get(1)
139            .and_then(|m| vars.get(m.as_str()))
140            .cloned()
141            .unwrap_or_default()
142    })
143    .into_owned()
144}
145
#[cfg(test)]
mod tests {
    use super::*;
    use crate::source::Rule;

    /// Parses a rule from its JSON form; panics if the JSON is not a valid rule.
    fn rule(j: &str) -> Rule {
        serde_json::from_str(j).expect("rule json")
    }

    // Synthetic bilixs-style table of contents: inside `.box`, direct-child h2 elements
    // (volumes) plus a.module-row-text links (chapters);
    // the "reading progress" h2 wrapped in a span should be excluded by `.box > h2`.
    const CATALOG: &str = r#"<html><body>
      <div class="box">
        <span id="shuqian"><h2 class="module-title type">阅读进度</h2></span>
        <h2 class="module-title type">第一卷 魔性不改</h2>
        <div class="module-row-info"><a class="module-row-text" href="/n/1.html"><i></i><div class="module-row-title"><span>第一章 甲</span></div></a></div>
        <div class="module-row-info"><a class="module-row-text" href="/n/2.html"><i></i><div class="module-row-title"><span>第二章 乙</span></div></a></div>
        <h2 class="module-title type">第二卷 魔子出山</h2>
        <div class="module-row-info"><a class="module-row-text" href="/n/3.html"><i></i><div class="module-row-title"><span>第三章 丙</span></div></a></div>
      </div>
    </body></html>"#;

    /// List rule shared by the catalog tests: volume headings plus chapter links.
    fn toc_list() -> Rule {
        rule(r#"{"via":"css","select":".box > h2.module-title.type, .box a.module-row-text"}"#)
    }

    #[test]
    fn list_selects_volumes_and_chapters_in_document_order() {
        let items = eval_list(&toc_list(), CATALOG).unwrap();
        assert_eq!(items.len(), 5, "2 卷 + 3 章 = 5(排除 span 内的阅读进度)");
    }

    #[test]
    fn toc_rules_split_into_volumes_and_chapters() {
        let name = rule(
            r#"{"firstOf":[{"via":"css","select":".module-row-title","extract":"text"},{"via":"css","select":"h2","extract":"text"}]}"#,
        );
        let url = rule(r#"{"via":"css","select":"a","extract":{"attr":"href"}}"#);
        let is_volume = rule(r#"{"via":"css","select":"h2","extract":"text"}"#);
        let vars = Vars::new();

        let mut chapters = Vec::new();
        let mut volumes = Vec::new();
        for it in eval_list(&toc_list(), CATALOG).unwrap() {
            let nm = eval_value(&name, &it, &vars).unwrap();
            // An item with no h2 text is a chapter; otherwise it is a volume heading.
            if eval_value(&is_volume, &it, &vars)
                .unwrap()
                .trim()
                .is_empty()
            {
                let u = eval_value(&url, &it, &vars).unwrap();
                chapters.push((nm, u));
            } else {
                volumes.push(nm);
            }
        }
        assert_eq!(volumes, vec!["第一卷 魔性不改", "第二卷 魔子出山"]);
        assert_eq!(chapters.len(), 3);
        assert_eq!(
            chapters[0],
            ("第一章 甲".to_string(), "/n/1.html".to_string())
        );
        assert_eq!(
            chapters[2],
            ("第三章 丙".to_string(), "/n/3.html".to_string())
        );
    }

    #[test]
    fn book_info_extracts_og_meta_attr() {
        let html = r#"<head><meta property="og:novel:book_name" content="蛊真人"><meta property="og:image" content="https://x/c.jpg"></head>"#;
        let name = rule(
            r#"{"via":"css","select":"[property=\"og:novel:book_name\"]","extract":{"attr":"content"}}"#,
        );
        assert_eq!(eval_value(&name, html, &Vars::new()).unwrap(), "蛊真人");
    }

    #[test]
    fn content_html_extract_cleans_paragraphs() {
        let html = r#"<div class="article-content"><p>第一段。</p><p>第二段。</p></div>"#;
        let r = rule(
            r#"{"via":"css","select":".article-content","extract":"html","clean":[{"trim":true}]}"#,
        );
        let out = eval_value(&r, html, &Vars::new()).unwrap();
        assert!(out.contains("第一段。"));
        assert!(out.contains("第二段。"));
        assert!(out.contains('\n'), "段落间应有换行");
    }

    #[test]
    fn clean_font_map_restores_via_inline_table() {
        // camelCase "fontMap" deserialization + wiring into the clean pipeline;
        // fontMap is given inline as a "code point → character" table.
        let r = rule(r#"{"via":"raw","clean":[{"fontMap":{"E001":"甲","E002":"乙"}}]}"#);
        assert_eq!(
            eval_value(&r, "\u{E001}\u{E002}!", &Vars::new()).unwrap(),
            "甲乙!"
        );
    }

    #[test]
    fn template_interpolates_vars() {
        let r = rule(r#"{"template":"{{base}}/search?q={{key}}&pg={{page}}"}"#);
        let mut vars = Vars::new();
        vars.insert("base".into(), "https://x.com".into());
        vars.insert("key".into(), "蛊真人".into());
        vars.insert("page".into(), "2".into());
        assert_eq!(
            eval_value(&r, "", &vars).unwrap(),
            "https://x.com/search?q=蛊真人&pg=2"
        );
    }

    #[test]
    fn firstof_falls_back_to_second_when_first_empty() {
        let r = rule(
            r#"{"firstOf":[{"via":"css","select":".nope","extract":"text"},{"via":"css","select":"h2","extract":"text"}]}"#,
        );
        let html = r#"<h2>标题</h2>"#;
        assert_eq!(eval_value(&r, html, &Vars::new()).unwrap(), "标题");
    }

    #[test]
    fn clean_regex_replace_strips_boilerplate() {
        let r = rule(
            r#"{"via":"raw","clean":[{"regex":"请收藏本站[^\\n]*","replace":""},{"trim":true}]}"#,
        );
        let out = eval_value(&r, "正文内容 请收藏本站xxx.com", &Vars::new()).unwrap();
        assert_eq!(out, "正文内容");
    }

    #[test]
    fn clean_pipeline_decrypts_content() {
        // End to end: the cipher step of the clean chain AES-CBC-decrypts a base64
        // ciphertext back into the plaintext.
        // First use transform to build the base64 ciphertext this book source would
        // return (simulating server-side encryption).
        use crate::source::{ByteEnc, CipherAlgo, CipherMode, CipherOp, CipherStep, Padding};
        let plain = "蛊真人 第一章 正文……";
        let ct = transform::cipher(
            plain,
            &CipherStep {
                algo: CipherAlgo::Aes,
                mode: CipherMode::Cbc,
                padding: Padding::Pkcs7,
                op: CipherOp::Encrypt,
                key: "0123456789abcdef".into(),
                key_enc: ByteEnc::Utf8,
                iv: Some("abcdef9876543210".into()),
                iv_enc: ByteEnc::Utf8,
                input_enc: Some(ByteEnc::Utf8),
                output_enc: Some(ByteEnc::Base64),
            },
        )
        .unwrap();

        // Book-source side: via:raw picks up the ciphertext and the clean cipher step
        // decrypts it (relying on the defaults: decrypt + inputEnc=base64).
        let r = rule(
            r#"{"via":"raw","clean":[{"cipher":{"algo":"aes","mode":"cbc","key":"0123456789abcdef","iv":"abcdef9876543210"}}]}"#,
        );
        let out = eval_value(&r, &ct, &Vars::new()).unwrap();
        assert_eq!(out, plain);
    }

    #[test]
    fn clean_cipher_error_propagates() {
        // Invalid base64 ciphertext → the cipher step reports an error
        // (explicit failure, not a silent empty string).
        let r = rule(
            r#"{"via":"raw","clean":[{"cipher":{"algo":"aes","mode":"cbc","key":"0123456789abcdef","iv":"abcdef9876543210"}}]}"#,
        );
        let err = eval_value(&r, "!!!not-base64!!!", &Vars::new());
        assert!(matches!(
            err,
            Err(EvalError::Codec(_) | EvalError::Crypto(_))
        ));
    }

    #[test]
    fn js_rule_parses_regardless_of_feature() {
        // Rule::Js and the clean.js variant always parse (configs stay portable and do
        // not depend on the feature).
        assert!(matches!(rule(r#"{"js":"result + '!'"}"#), Rule::Js { .. }));
        let r = rule(r#"{"via":"raw","clean":[{"js":"result"}]}"#);
        assert!(matches!(r, Rule::Leaf(_)));
    }

    #[cfg(not(feature = "js"))]
    #[test]
    fn js_rule_unsupported_without_feature() {
        let r = rule(r#"{"js":"result + '!'"}"#);
        assert!(matches!(
            eval_value(&r, "x", &Vars::new()),
            Err(EvalError::Unsupported("js"))
        ));
    }

    #[cfg(feature = "js")]
    #[test]
    fn js_rule_evaluates_with_feature() {
        let r = rule(r#"{"js":"result + '!'"}"#);
        assert_eq!(eval_value(&r, "x", &Vars::new()).unwrap(), "x!");
        // clean.js takes effect too: JS post-processing after the value is extracted.
        let r2 = rule(r#"{"via":"raw","clean":[{"js":"result.toUpperCase()"}]}"#);
        assert_eq!(eval_value(&r2, "abc", &Vars::new()).unwrap(), "ABC");
    }
}