Skip to main content

parse_book_source/
eval.rs

1//! 规则解释器(Interpreter + Composite)。递归遍历 [`Rule`] 求值;无字符串 DSL 解析。
2//!
3//! 两个入口:
4//! - [`eval_value`]:值规则 → 一个字符串。
5//! - [`eval_list`]:列表规则 → 多个「子上下文」内容串(每个供后续 item 规则求值)。
6
7use super::backend;
8use super::error::EvalError;
9use super::source::{CleanStep, Rule};
10use fancy_regex::Regex;
11use std::collections::HashMap;
12use std::sync::LazyLock;
13
/// Variable table consulted when expanding `{{name}}` placeholders in templates.
///
/// Keys are placeholder names (for example `key`, `page`, `base`, or the name of
/// a capture group); values are the text substituted for them.
pub type Vars = HashMap<String, String>;
16
17/// 对当前上下文求一个值。
18pub fn eval_value(rule: &Rule, ctx: &str, vars: &Vars) -> Result<String, EvalError> {
19    match rule {
20        Rule::Literal { literal } => Ok(literal.clone()),
21        Rule::Template { template } => Ok(interpolate(template, vars)),
22        Rule::FirstOf { first_of } => {
23            for r in first_of {
24                let v = eval_value(r, ctx, vars)?;
25                if !v.trim().is_empty() {
26                    return Ok(v);
27                }
28            }
29            Ok(String::new())
30        }
31        Rule::Concat { concat, join } => {
32            let mut parts = Vec::new();
33            for r in concat {
34                let v = eval_value(r, ctx, vars)?;
35                if !v.trim().is_empty() {
36                    parts.push(v);
37                }
38            }
39            Ok(parts.join(join))
40        }
41        Rule::Leaf(l) => {
42            let raw = backend::extract(l.via, ctx, l.select.as_deref(), l.index, &l.extract)?;
43            Ok(apply_clean(raw, &l.clean))
44        }
45    }
46}
47
48/// 选中所有匹配,返回各自的子上下文内容串。
49pub fn eval_list(rule: &Rule, ctx: &str) -> Result<Vec<String>, EvalError> {
50    match rule {
51        Rule::Leaf(l) => match l.select.as_deref() {
52            Some(sel) => backend::select_all(l.via, ctx, sel),
53            // 无选择器:把当前上下文作为单一项(而非把空串当非法选择器)。
54            None => Ok(vec![ctx.to_string()]),
55        },
56        Rule::FirstOf { first_of } => {
57            for r in first_of {
58                let v = eval_list(r, ctx)?;
59                if !v.is_empty() {
60                    return Ok(v);
61                }
62            }
63            Ok(Vec::new())
64        }
65        // literal/template/concat 作为列表无意义:退化为单值(若非空)。
66        other => {
67            let v = eval_value(other, ctx, &Vars::new())?;
68            Ok(if v.is_empty() { Vec::new() } else { vec![v] })
69        }
70    }
71}
72
73/// 应用清洗流水线(regex→replace、trim、prepend、append,按步顺序)。
74fn apply_clean(mut s: String, steps: &[CleanStep]) -> String {
75    for step in steps {
76        if let Some(pat) = &step.regex
77            && let Ok(re) = Regex::new(pat)
78        {
79            let rep = step.replace.as_deref().unwrap_or("");
80            s = re.replace_all(&s, rep).into_owned();
81        }
82        if step.trim.unwrap_or(false) {
83            s = s.trim().to_string();
84        }
85        if let Some(p) = &step.prepend {
86            s = format!("{p}{s}");
87        }
88        if let Some(a) = &step.append {
89            s = format!("{s}{a}");
90        }
91    }
92    s
93}
94
95/// 把 `{{key}}` 替换为变量值,未知键替换为空串。
96fn interpolate(template: &str, vars: &Vars) -> String {
97    static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{\{\s*([\w.\-]+)\s*\}\}").unwrap());
98    RE.replace_all(template, |c: &fancy_regex::Captures| {
99        c.get(1)
100            .and_then(|m| vars.get(m.as_str()))
101            .cloned()
102            .unwrap_or_default()
103    })
104    .into_owned()
105}
106
#[cfg(test)]
mod tests {
    use super::*;
    use crate::source::Rule;

    /// Deserializes a rule from its JSON form, panicking on malformed input.
    fn rule(json: &str) -> Rule {
        let parsed: Result<Rule, _> = serde_json::from_str(json);
        parsed.expect("rule json")
    }

    // Synthetic bilixs-style table of contents: inside `.box` there are direct
    // child `h2` elements (volumes) and `a.module-row-text` links (chapters).
    // The "reading progress" heading wrapped in a `span` must be excluded by
    // the `.box > h2` child selector.
    const CATALOG: &str = r#"<html><body>
      <div class="box">
        <span id="shuqian"><h2 class="module-title type">阅读进度</h2></span>
        <h2 class="module-title type">第一卷 魔性不改</h2>
        <div class="module-row-info"><a class="module-row-text" href="/n/1.html"><i></i><div class="module-row-title"><span>第一章 甲</span></div></a></div>
        <div class="module-row-info"><a class="module-row-text" href="/n/2.html"><i></i><div class="module-row-title"><span>第二章 乙</span></div></a></div>
        <h2 class="module-title type">第二卷 魔子出山</h2>
        <div class="module-row-info"><a class="module-row-text" href="/n/3.html"><i></i><div class="module-row-title"><span>第三章 丙</span></div></a></div>
      </div>
    </body></html>"#;

    /// List rule that selects volume headings and chapter links together.
    fn toc_list() -> Rule {
        rule(r#"{"via":"css","select":".box > h2.module-title.type, .box a.module-row-text"}"#)
    }

    #[test]
    fn list_selects_volumes_and_chapters_in_document_order() {
        let selected = eval_list(&toc_list(), CATALOG).unwrap();
        assert_eq!(selected.len(), 5, "2 卷 + 3 章 = 5(排除 span 内的阅读进度)");
    }

    #[test]
    fn toc_rules_split_into_volumes_and_chapters() {
        let name = rule(
            r#"{"firstOf":[{"via":"css","select":".module-row-title","extract":"text"},{"via":"css","select":"h2","extract":"text"}]}"#,
        );
        let url = rule(r#"{"via":"css","select":"a","extract":{"attr":"href"}}"#);
        let is_volume = rule(r#"{"via":"css","select":"h2","extract":"text"}"#);
        let no_vars = Vars::new();

        let entries = eval_list(&toc_list(), CATALOG).unwrap();
        let mut volumes = Vec::new();
        let mut chapters = Vec::new();
        for entry in &entries {
            let title = eval_value(&name, entry, &no_vars).unwrap();
            // An entry with no `h2` text is a chapter link, otherwise a volume.
            let heading = eval_value(&is_volume, entry, &no_vars).unwrap();
            if heading.trim().is_empty() {
                let href = eval_value(&url, entry, &no_vars).unwrap();
                chapters.push((title, href));
            } else {
                volumes.push(title);
            }
        }

        assert_eq!(volumes, vec!["第一卷 魔性不改", "第二卷 魔子出山"]);
        assert_eq!(chapters.len(), 3);

        let first = &chapters[0];
        assert_eq!((first.0.as_str(), first.1.as_str()), ("第一章 甲", "/n/1.html"));
        let last = &chapters[2];
        assert_eq!((last.0.as_str(), last.1.as_str()), ("第三章 丙", "/n/3.html"));
    }

    #[test]
    fn book_info_extracts_og_meta_attr() {
        let page = r#"<head><meta property="og:novel:book_name" content="蛊真人"><meta property="og:image" content="https://x/c.jpg"></head>"#;
        let book_name = rule(
            r#"{"via":"css","select":"[property=\"og:novel:book_name\"]","extract":{"attr":"content"}}"#,
        );
        let extracted = eval_value(&book_name, page, &Vars::new()).unwrap();
        assert_eq!(extracted, "蛊真人");
    }

    #[test]
    fn content_html_extract_cleans_paragraphs() {
        let page = r#"<div class="article-content"><p>第一段。</p><p>第二段。</p></div>"#;
        let content = rule(
            r#"{"via":"css","select":".article-content","extract":"html","clean":[{"trim":true}]}"#,
        );
        let text = eval_value(&content, page, &Vars::new()).unwrap();
        for paragraph in ["第一段。", "第二段。"] {
            assert!(text.contains(paragraph));
        }
        assert!(text.contains('\n'), "段落间应有换行");
    }

    #[test]
    fn template_interpolates_vars() {
        let search_url = rule(r#"{"template":"{{base}}/search?q={{key}}&pg={{page}}"}"#);
        let vars: Vars = [("base", "https://x.com"), ("key", "蛊真人"), ("page", "2")]
            .into_iter()
            .map(|(k, v)| (k.to_string(), v.to_string()))
            .collect();
        let expanded = eval_value(&search_url, "", &vars).unwrap();
        assert_eq!(expanded, "https://x.com/search?q=蛊真人&pg=2");
    }

    #[test]
    fn firstof_falls_back_to_second_when_first_empty() {
        let fallback = rule(
            r#"{"firstOf":[{"via":"css","select":".nope","extract":"text"},{"via":"css","select":"h2","extract":"text"}]}"#,
        );
        let page = r#"<h2>标题</h2>"#;
        let picked = eval_value(&fallback, page, &Vars::new()).unwrap();
        assert_eq!(picked, "标题");
    }

    #[test]
    fn clean_regex_replace_strips_boilerplate() {
        let cleaner = rule(
            r#"{"via":"raw","clean":[{"regex":"请收藏本站[^\\n]*","replace":""},{"trim":true}]}"#,
        );
        let cleaned = eval_value(&cleaner, "正文内容 请收藏本站xxx.com", &Vars::new()).unwrap();
        assert_eq!(cleaned, "正文内容");
    }
}
226}