Skip to main content

parse_book_source/
engine.rs

1//! 用例引擎(Template Method + Paginator)。五个操作共享「取页 → 选列表/值 → 映射 →
2//! 可选有界分页」骨架;`Engine` 廉价 `Clone`(内部 `Arc`),操作不跨 await 持锁(D10)。
3
4use super::cookie::{
5    CookieJar, merge_login_into_headers, registrable_domain, request_registrable_domain,
6};
7use super::error::{BookSourceError, Result};
8use super::eval::{Vars, eval_list, eval_value, interpolate};
9use super::fetch::{FetchRequest, Fetcher, ReqwestFetcher};
10use super::model::{BookInfo, BookListItem, Chapter, Toc, Volume};
11use super::source::{
12    BookRules, BookSource, Capture, Category, Method, PreStep, Rule, UrlOrRule, VarScope,
13};
14use std::collections::{BTreeMap, HashMap};
15use std::sync::{Arc, RwLock};
16
/// Runtime engine for one book source.
#[derive(Clone)]
pub struct Engine {
    source: Arc<BookSource>,
    fetcher: Arc<dyn Fetcher>,
    /// Login-state request headers (JWT / custom headers / Cookie all take this path), merged into
    /// every **same-registrable-domain** request the engine builds. Cross-domain requests are
    /// skipped so that third-party URLs induced by page content cannot leak credentials; see
    /// [`merge_login_into_headers`]. Injected by the caller after login through
    /// [`Engine::with_login_header`] (sourced from per-source state).
    login_header: BTreeMap<String, String>,
    /// Cookie store (per registrable domain, session and persistent kept separate): merged into
    /// the `Cookie` header before a request and, when `enabledCookieJar` is set, fed back from the
    /// response `Set-Cookie`. `Arc<RwLock>` makes cloned engines share the same store.
    cookies: Arc<RwLock<CookieJar>>,
    /// Source-level captured variables (`scope=source`, D7-bis): shared across operations (and
    /// across cloned engines); lowest priority when flattening; suited to site-wide constants.
    source_vars: Arc<RwLock<BTreeMap<String, String>>>,
    /// Book-level captured variables (`scope=book`, D7-bis): per book, injected by the app through
    /// [`Engine::with_book_vars`] and exported through [`Engine::book_vars`] (persisted with the
    /// per-book snapshot).
    book_vars: Arc<RwLock<BTreeMap<String, String>>>,
}
36
37impl std::fmt::Debug for Engine {
38    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
39        f.debug_struct("Engine")
40            .field("source", &self.source.name)
41            .finish_non_exhaustive()
42    }
43}
44
45impl Engine {
46    /// 用默认 reqwest 取页后端构建。
47    pub fn new(source: BookSource) -> Result<Self> {
48        let fetcher = Arc::new(ReqwestFetcher::new(&source)?);
49        Ok(Self::with_fetcher(source, fetcher))
50    }
51
52    /// 注入自定义取页后端(测试替身 / 反爬适配器)构建。
53    /// 这是**唯一真实构造器**:共享字段(登录态/cookie 库/作用域变量)的默认初始化只在此处,
54    /// 其余构造器([`Engine::new`] / `with_browser_assist`)一律委托,避免新增字段漏改。
55    pub fn with_fetcher(source: BookSource, fetcher: Arc<dyn Fetcher>) -> Self {
56        Self {
57            source: Arc::new(source),
58            fetcher,
59            login_header: BTreeMap::new(),
60            cookies: Arc::new(RwLock::new(CookieJar::default())),
61            source_vars: Arc::new(RwLock::new(BTreeMap::new())),
62            book_vars: Arc::new(RwLock::new(BTreeMap::new())),
63        }
64    }
65
66    /// 注入登录态请求头(登录后由调用方从 per-source 状态取出)。链式构造:
67    /// `Engine::new(src)?.with_login_header(state.login_header)`。空 map 等同未登录。
68    #[must_use]
69    pub fn with_login_header(mut self, login_header: BTreeMap<String, String>) -> Self {
70        self.login_header = login_header;
71        self
72    }
73
74    /// 用持久化 cookie(`注册域 -> "k=v"`,来自 per-source 状态)初始化 cookie 库。链式构造。
75    #[must_use]
76    pub fn with_cookies(self, persistent: &BTreeMap<String, String>) -> Self {
77        if let Ok(mut jar) = self.cookies.write() {
78            *jar = CookieJar::from_persistent(persistent);
79        }
80        self
81    }
82
83    /// 导出当前 cookie 库中的 **persistent** cookie(`注册域 -> "k=v"`),供调用方落盘。
84    /// session cookie 不导出(重启失效)。
85    pub fn persistent_cookies(&self) -> BTreeMap<String, String> {
86        self.cookies
87            .read()
88            .map(|j| j.persistent())
89            .unwrap_or_default()
90    }
91
92    /// 注入书籍级捕获变量(`scope=book`,来自 per-book 快照)。链式构造(贴 [`Engine::with_login_header`])。
93    #[must_use]
94    pub fn with_book_vars(self, book_vars: BTreeMap<String, String>) -> Self {
95        if let Ok(mut g) = self.book_vars.write() {
96            *g = book_vars;
97        }
98        self
99    }
100
101    /// 合并书源级捕获变量(`scope=source`,来自 per-source 状态)。链式构造。
102    #[must_use]
103    pub fn with_source_vars(self, source_vars: &BTreeMap<String, String>) -> Self {
104        if let Ok(mut g) = self.source_vars.write() {
105            for (k, v) in source_vars {
106                g.insert(k.clone(), v.clone());
107            }
108        }
109        self
110    }
111
112    /// 导出书籍级捕获变量,供 app 随 per-book 快照落盘(`scope=book` 跨会话复用的承载)。
113    pub fn book_vars(&self) -> BTreeMap<String, String> {
114        self.book_vars.read().map(|g| g.clone()).unwrap_or_default()
115    }
116
117    /// 导出书源级捕获变量,供 app 落盘(可选;默认构建为进程内)。
118    pub fn source_vars(&self) -> BTreeMap<String, String> {
119        self.source_vars
120            .read()
121            .map(|g| g.clone())
122            .unwrap_or_default()
123    }
124
125    /// 用「升级式取页」构建(`browser` feature):平时 reqwest,撞挑战且 `browser` 为
126    /// `Some` 时升级解挑战。是否传入浏览器(书源 `http.fetcher` ∧ 用户授权 ∧ 探测到)
127    /// 的策略由调用方(app)决定;`None` 等同纯 reqwest(撞挑战即降级)。
128    #[cfg(feature = "browser")]
129    pub fn with_browser_assist(
130        source: BookSource,
131        browser: Option<crate::browser::BrowserFetcher>,
132    ) -> Result<Self> {
133        let fetcher = crate::browser::EscalatingFetcher::new(&source, browser)?;
134        Ok(Self::with_fetcher(source, Arc::new(fetcher)))
135    }
136
137    /// 暴露只读配置。
138    pub fn source(&self) -> &BookSource {
139        &self.source
140    }
141
142    /// 书源 URL(per-source 登录态文件的 key,供 app 回写 persistent cookie 落盘时定位)。
143    pub fn source_url(&self) -> &str {
144        &self.source.url
145    }
146
147    fn base_vars(&self) -> Vars {
148        let mut v = Vars::new();
149        v.insert(
150            "base".into(),
151            self.source.url.trim_end_matches('/').to_string(),
152        );
153        v
154    }
155
156    /// 构造一个带登录态的 GET 请求——引擎所有取页统一经此并入 loginHeader + cookie 库。
157    fn get_req(&self, url: impl Into<String>) -> FetchRequest {
158        let mut req = FetchRequest::get(url);
159        let url = req.url.clone();
160        self.apply_auth(&url, &mut req.headers);
161        req
162    }
163
164    /// 注册域(请求 URL 绝对则取其注册域,相对则取书源注册域)。
165    fn request_domain(&self, url: &str) -> String {
166        request_registrable_domain(url, &registrable_domain(&self.source.url))
167    }
168
169    /// 把登录态并入请求头(合并的最后一层),与 host 侧共用 [`merge_login_into_headers`]:
170    /// loginHeader 仅注入**同注册域**请求(防页面内容诱导的第三方 URL 外泄凭据);
171    /// Cookie = 已有头 Cookie ← loginHeader Cookie ← cookie 库(请求注册域)按 key 去重合并;
172    /// 全部值剥 CR/LF(已落盘的脏数据不致让 reqwest 构建失败、拖垮该书源全部请求)。
173    fn apply_auth(&self, url: &str, headers: &mut HashMap<String, String>) {
174        let source_domain = registrable_domain(&self.source.url);
175        let domain = request_registrable_domain(url, &source_domain);
176        let jar_cookie = self
177            .cookies
178            .read()
179            .ok()
180            .and_then(|j| j.cookie_header(&domain));
181        merge_login_into_headers(
182            &self.login_header,
183            &source_domain,
184            &domain,
185            jar_cookie.as_deref(),
186            headers,
187        );
188    }
189
190    /// 发请求(带登录态)→ `enabledCookieJar` 时回灌 `Set-Cookie` → `loginCheckJs` 校验登录态。
191    /// 失效返回 [`BookSourceError::LoginExpired`]。引擎所有取页统一经此。
192    async fn run_request(&self, req: FetchRequest) -> Result<String> {
193        let domain = self.request_domain(&req.url);
194        let resp = self.fetcher.fetch_full(req).await?;
195        if self.source.enabled_cookie_jar
196            && let Some(set_cookie) = resp.headers.get("set-cookie")
197            && let Ok(mut jar) = self.cookies.write()
198        {
199            jar.absorb_set_cookie(&domain, set_cookie);
200        }
201        self.check_login(&resp.body)?;
202        Ok(resp.body)
203    }
204
205    /// 取页(带登录态 + 回灌 + 登录校验)。
206    async fn fetch_checked(&self, url: impl Into<String>) -> Result<String> {
207        self.run_request(self.get_req(url)).await
208    }
209
210    /// `loginCheckJs`(响应期登录态校验,D10 第一版):脚本以 `result`=响应求值;
211    /// 返回空 / `false` / `0` 视为登录失效 → 抛 [`BookSourceError::LoginExpired`] 提示用户重登。
212    /// 空脚本或未启用 `js` feature 时为 no-op。
213    fn check_login(&self, response: &str) -> Result<()> {
214        let js = self.source.login_check_js.trim();
215        if js.is_empty() {
216            return Ok(());
217        }
218        #[cfg(feature = "js")]
219        {
220            let vars = self.base_vars();
221            let verdict = eval_value(&Rule::Js { js: js.to_string() }, response, &vars)?;
222            if matches!(verdict.trim(), "" | "false" | "0") {
223                return Err(BookSourceError::LoginExpired);
224            }
225        }
226        let _ = response;
227        Ok(())
228    }
229
230    /// 预热:按 `http.warmup` 先访问若干页以累积会话 cookie(失败忽略)。
231    /// 走 `Engine::run_request` 统一管线——`enabledCookieJar` 时预热页的 `Set-Cookie`
232    /// 才会回灌引擎 cookie 库(`loginCheckJs` 在预热页可能误判,但错误被吞,不影响预热语义)。
233    pub async fn warmup(&self) {
234        for u in &self.source.http.warmup {
235            let _ = self.run_request(self.get_req(u.clone())).await;
236        }
237    }
238
239    /// 书籍详情(可选前置请求链 → 取详情页 → 抽取)。
240    pub async fn book_info(&self, book_url: &str) -> Result<BookInfo> {
241        let mut chapter = self.base_vars();
242        self.run_prelude(&self.source.book_info.prelude, &mut chapter)
243            .await?;
244        let html = self.fetch_checked(book_url).await?;
245        let rules = self.source.book_info.as_book_rules();
246        self.eval_book_info(&rules, &html, &self.flatten(&chapter))
247    }
248
249    /// 目录(章节 + 分卷),支持前置请求链 + 有界分页。
250    pub async fn toc(&self, toc_url: &str) -> Result<Toc> {
251        let toc = &self.source.toc;
252        let mut chapter = self.base_vars();
253        self.run_prelude(&toc.prelude, &mut chapter).await?;
254        let vars = self.flatten(&chapter);
255        let pages = self
256            .fetch_pages(toc_url, toc.next_page.as_ref(), toc.max_pages, &vars)
257            .await?;
258
259        let mut chapters: Vec<Chapter> = Vec::new();
260        let mut volumes: Vec<Volume> = Vec::new();
261        for page in &pages {
262            for item in eval_list(&toc.list, page)? {
263                let title = eval_value(&toc.name, &item, &vars)?;
264                let is_volume = match &toc.is_volume {
265                    Some(r) => !eval_value(r, &item, &vars)?.trim().is_empty(),
266                    None => false,
267                };
268                if is_volume {
269                    volumes.push(Volume {
270                        title,
271                        first_chapter_index: chapters.len(),
272                    });
273                } else {
274                    let url = eval_value(&toc.url, &item, &vars)?;
275                    chapters.push(Chapter {
276                        title,
277                        url,
278                        is_volume: false,
279                    });
280                }
281            }
282        }
283        Ok(Toc { chapters, volumes })
284    }
285
286    /// 正文,支持前置请求链 + 有界分页。
287    pub async fn content(&self, chapter_url: &str) -> Result<String> {
288        let c = &self.source.content;
289        let mut chapter = self.base_vars();
290        self.run_prelude(&c.prelude, &mut chapter).await?;
291        let vars = self.flatten(&chapter);
292        let pages = self
293            .fetch_pages(chapter_url, c.next_page.as_ref(), c.max_pages, &vars)
294            .await?;
295        let mut parts = Vec::with_capacity(pages.len());
296        for page in &pages {
297            parts.push(eval_value(&c.value, page, &vars)?);
298        }
299        Ok(parts.join("\n"))
300    }
301
302    /// 搜索。
303    pub async fn search(&self, key: &str, page: u32, page_size: u32) -> Result<Vec<BookListItem>> {
304        let op = self
305            .source
306            .search
307            .as_ref()
308            .ok_or(BookSourceError::Missing("search"))?;
309        let mut chapter = self.base_vars();
310        chapter.insert("key".into(), key.to_string());
311        chapter.insert("page".into(), page.to_string());
312        chapter.insert("pageSize".into(), page_size.to_string());
313        self.run_prelude(&op.prelude, &mut chapter).await?;
314
315        let vars = self.flatten(&chapter);
316        let html = self
317            .send_templated(
318                &op.request.url,
319                op.request.method,
320                op.request.body.as_ref(),
321                &op.request.headers,
322                &vars,
323            )
324            .await?;
325        // 主请求 vars 捕获(chapter 级):对搜索响应求值,使 list/item 可见(captured-before-referenced)。
326        // flatten 刻意提在循环外:各条 vars **独立**对响应求值(见 source.rs `Request.vars` 契约
327        // 「勿互相引用」,有序依赖应走 prelude 链),也避免循环内重复克隆三层作用域。
328        let flat = self.flatten(&chapter);
329        for (name, rule) in &op.request.vars {
330            let v = eval_value(rule, &html, &flat)?;
331            if !v.is_empty() {
332                chapter.insert(name.clone(), v);
333            }
334        }
335        self.eval_list_items(&op.list, &op.item, &html, &self.flatten(&chapter))
336    }
337
338    /// 浏览某分类的某一页。
339    pub async fn explore(
340        &self,
341        category_url: &UrlOrRule,
342        page: u32,
343        page_size: u32,
344    ) -> Result<Vec<BookListItem>> {
345        let op = self
346            .source
347            .explore
348            .as_ref()
349            .ok_or(BookSourceError::Missing("explore"))?;
350        let mut chapter = self.base_vars();
351        chapter.insert("page".into(), page.to_string());
352        chapter.insert("pageSize".into(), page_size.to_string());
353        self.run_prelude(&op.prelude, &mut chapter).await?;
354        let vars = self.flatten(&chapter);
355        let url = self.resolve_url(category_url, &vars)?;
356        let html = self.fetch_checked(url).await?;
357        self.eval_list_items(&op.list, &op.item, &html, &vars)
358    }
359
360    /// 浏览分类列表,供上层选择后翻页。
361    pub fn explore_categories(&self) -> Vec<Category> {
362        self.source
363            .explore
364            .as_ref()
365            .map(|e| e.categories.clone())
366            .unwrap_or_default()
367    }
368
369    // ── 内部 ──
370
371    /// 把 chapter 层与引擎的 book/source 层 overlay 成单个扁平 `Vars`(`interpolate` 只吃扁平表)。
372    /// 优先级 `source < book < chapter`(高优先级后插覆盖)= get 时 章节→书籍→书源 取第一个非空。
373    fn flatten(&self, chapter: &Vars) -> Vars {
374        let mut out = Vars::new();
375        if let Ok(g) = self.source_vars.read() {
376            out.extend(g.iter().map(|(k, v)| (k.clone(), v.clone())));
377        }
378        if let Ok(g) = self.book_vars.read() {
379            out.extend(g.iter().map(|(k, v)| (k.clone(), v.clone())));
380        }
381        out.extend(chapter.iter().map(|(k, v)| (k.clone(), v.clone())));
382        out
383    }
384
385    /// 执行前置请求链(D7-bis):按数组顺序串行发请求,每步对其响应做命名捕获写入作用域。
386    /// `chapter` 是本次调用的临时层(含 base/key/page),chapter 级捕获就地累积;捕获天然先于引用
387    /// (响应后才捕获 + 数组顺序)。锁仅在求值前后瞬时持有,不跨 await(满足 D10)。
388    async fn run_prelude(&self, steps: &[PreStep], chapter: &mut Vars) -> Result<()> {
389        for step in steps {
390            // skipIfPresent:列出的 key 在作用域内全部非空 → 跳过本步(token 复用,省 RTT)。
391            if !step.skip_if_present.is_empty() {
392                let flat = self.flatten(chapter);
393                if step
394                    .skip_if_present
395                    .iter()
396                    .all(|k| flat.get(k).is_some_and(|v| !v.is_empty()))
397                {
398                    continue;
399                }
400            }
401            let flat = self.flatten(chapter);
402            let resp = self
403                .send_templated(
404                    &step.url,
405                    step.method,
406                    step.body.as_ref(),
407                    &step.headers,
408                    &flat,
409                )
410                .await?;
411            self.capture_into(&step.capture, &resp, chapter)?;
412        }
413        Ok(())
414    }
415
416    /// 发送一个「模板化请求」——search 主请求与 prelude 步骤共用的五步骨架:
417    /// resolve url/body → header 值 `{{name}}` 插值(可引用前置捕获的 token)→
418    /// 并入登录态(apply_auth)→ [`Engine::run_request`]。
419    /// `vars` 须为调用方已 flatten 的扁平表;请求后的差异化处理(`Request.vars` 捕获 /
420    /// prelude 的 `capture_into`)留在调用点。
421    async fn send_templated(
422        &self,
423        url: &UrlOrRule,
424        method: Method,
425        body: Option<&UrlOrRule>,
426        headers: &HashMap<String, String>,
427        vars: &Vars,
428    ) -> Result<String> {
429        let url = self.resolve_url(url, vars)?;
430        let body = match body {
431            Some(b) => Some(self.resolve_url(b, vars)?),
432            None => None,
433        };
434        let mut hdrs = HashMap::with_capacity(headers.len());
435        for (k, v) in headers {
436            hdrs.insert(k.clone(), interpolate(v, vars));
437        }
438        self.apply_auth(&url, &mut hdrs);
439        self.run_request(FetchRequest {
440            url,
441            method,
442            body,
443            headers: hdrs,
444        })
445        .await
446    }
447
448    /// 对一段响应按 `capture` 顺序求值并写入各作用域层;空串不写(防污染低优先级层的非空值)。
449    fn capture_into(&self, caps: &[Capture], body: &str, chapter: &mut Vars) -> Result<()> {
450        for cap in caps {
451            let v = eval_value(&cap.value, body, &self.flatten(chapter))?;
452            if v.is_empty() {
453                continue;
454            }
455            match cap.scope {
456                VarScope::Chapter => {
457                    chapter.insert(cap.name.clone(), v);
458                }
459                VarScope::Book => {
460                    if let Ok(mut g) = self.book_vars.write() {
461                        g.insert(cap.name.clone(), v);
462                    }
463                }
464                VarScope::Source => {
465                    if let Ok(mut g) = self.source_vars.write() {
466                        g.insert(cap.name.clone(), v);
467                    }
468                }
469            }
470        }
471        Ok(())
472    }
473
474    /// 有界分页抓取:从 `start` 起,若 `next_page` 求值得非空 URL 则续抓,直到为空或达 `max_pages`。
475    async fn fetch_pages(
476        &self,
477        start: &str,
478        next_page: Option<&Rule>,
479        max_pages: u32,
480        vars: &Vars,
481    ) -> Result<Vec<String>> {
482        let mut pages = Vec::new();
483        let mut url = start.to_string();
484        for _ in 0..max_pages.max(1) {
485            let html = self.fetch_checked(url.clone()).await?;
486            let next = match next_page {
487                Some(r) => eval_value(r, &html, vars)?,
488                None => String::new(),
489            };
490            pages.push(html);
491            if next.trim().is_empty() {
492                break;
493            }
494            url = next;
495        }
496        Ok(pages)
497    }
498
499    fn eval_list_items(
500        &self,
501        list: &Rule,
502        item: &BookRules,
503        html: &str,
504        vars: &Vars,
505    ) -> Result<Vec<BookListItem>> {
506        let mut out = Vec::new();
507        for ctx in eval_list(list, html)? {
508            let info = self.eval_book_info(item, &ctx, vars)?;
509            let book_url = opt_eval(item.book_url.as_ref(), &ctx, vars)?;
510            out.push(BookListItem { info, book_url });
511        }
512        Ok(out)
513    }
514
515    fn eval_book_info(&self, r: &BookRules, ctx: &str, vars: &Vars) -> Result<BookInfo> {
516        Ok(BookInfo {
517            name: opt_eval(r.name.as_ref(), ctx, vars)?,
518            author: opt_eval(r.author.as_ref(), ctx, vars)?,
519            cover: opt_eval(r.cover.as_ref(), ctx, vars)?,
520            intro: opt_eval(r.intro.as_ref(), ctx, vars)?,
521            kind: opt_eval(r.kind.as_ref(), ctx, vars)?,
522            last_chapter: opt_eval(r.last_chapter.as_ref(), ctx, vars)?,
523            toc_url: opt_eval(r.toc_url.as_ref(), ctx, vars)?,
524            word_count: opt_eval(r.word_count.as_ref(), ctx, vars)?,
525        })
526    }
527
528    fn resolve_url(&self, u: &UrlOrRule, vars: &Vars) -> Result<String> {
529        Ok(match u {
530            // 字符串按模板插值({{base}}/{{key}}/{{page}} 等)。
531            UrlOrRule::Str(s) => eval_value(
532                &Rule::Template {
533                    template: s.clone(),
534                },
535                "",
536                vars,
537            )?,
538            UrlOrRule::Rule(r) => eval_value(r, "", vars)?,
539        })
540    }
541}
542
543/// 求值一个可选规则;None 或空 → 空串。
544fn opt_eval(rule: Option<&Rule>, ctx: &str, vars: &Vars) -> Result<String> {
545    Ok(match rule {
546        Some(r) => eval_value(r, ctx, vars)?,
547        None => String::new(),
548    })
549}
550
551#[cfg(test)]
552mod tests {
553    use super::*;
554    use crate::error::FetchError;
555    use crate::fetch::{FetchResponse, Fetcher};
556    use async_trait::async_trait;
557
558    use std::sync::Mutex;
559
560    /// 注入固定 HTML 的取页替身,使引擎可离线单测(D9)。
561    struct MockFetcher(String);
562
563    #[async_trait]
564    impl Fetcher for MockFetcher {
565        async fn fetch(&self, _req: FetchRequest) -> std::result::Result<String, FetchError> {
566            Ok(self.0.clone())
567        }
568    }
569
570    /// 记录最近一次请求头的取页替身(验证引擎是否并入 loginHeader)。
571    struct RecordingFetcher {
572        body: String,
573        last_headers: Arc<Mutex<HashMap<String, String>>>,
574    }
575
576    #[async_trait]
577    impl Fetcher for RecordingFetcher {
578        async fn fetch(&self, req: FetchRequest) -> std::result::Result<String, FetchError> {
579            *self.last_headers.lock().unwrap() = req.headers;
580            Ok(self.body.clone())
581        }
582    }
583
584    /// 记录请求 Cookie 头并固定返回一个 `Set-Cookie` 的替身(验证 enabledCookieJar 回灌/再发)。
585    struct CookieEchoFetcher {
586        set_cookie: String,
587        last_cookie: Arc<Mutex<Option<String>>>,
588    }
589
590    #[async_trait]
591    impl Fetcher for CookieEchoFetcher {
592        async fn fetch(&self, req: FetchRequest) -> std::result::Result<String, FetchError> {
593            self.fetch_full(req).await.map(|r| r.body)
594        }
595        async fn fetch_full(
596            &self,
597            req: FetchRequest,
598        ) -> std::result::Result<FetchResponse, FetchError> {
599            *self.last_cookie.lock().unwrap() = req.headers.get("Cookie").cloned();
600            let mut headers = HashMap::new();
601            headers.insert("set-cookie".to_string(), self.set_cookie.clone());
602            Ok(FetchResponse {
603                body: CATALOG.to_string(),
604                status: 200,
605                headers,
606            })
607        }
608    }
609
    /// Catalogue page fixture: inside `div.box` there is one `h2` wrapped in `span#shuqian`,
    /// two `h2.module-title.type` headings that are direct children of the box, and three
    /// `a.module-row-text` chapter links.
    const CATALOG: &str = r#"<html><body><div class="box">
        <span id="shuqian"><h2 class="module-title type">阅读进度</h2></span>
        <h2 class="module-title type">第一卷</h2>
        <div class="module-row-info"><a class="module-row-text" href="/n/1.html"><div class="module-row-title"><span>第一章</span></div></a></div>
        <div class="module-row-info"><a class="module-row-text" href="/n/2.html"><div class="module-row-title"><span>第二章</span></div></a></div>
        <h2 class="module-title type">第二卷</h2>
        <div class="module-row-info"><a class="module-row-text" href="/n/3.html"><div class="module-row-title"><span>第三章</span></div></a></div>
        </div></body></html>"#;
618
    /// Minimal book-source JSON fixture (url `https://x`): empty `bookInfo`, a `toc` whose list
    /// selector picks the box's direct `h2` headings and the chapter links, and a `content` rule.
    /// Tests splice extra top-level keys in front of `"bookInfo":{}` via `replacen`.
    const SOURCE: &str = r#"{
      "schema":"trnovel-booksource/v2","name":"t","url":"https://x",
      "bookInfo":{},
      "toc":{
        "list":{"via":"css","select":".box > h2.module-title.type, .box a.module-row-text"},
        "name":{"firstOf":[{"via":"css","select":".module-row-title","extract":"text"},{"via":"css","select":"h2","extract":"text"}]},
        "url":{"via":"css","select":"a","extract":{"attr":"href"}},
        "isVolume":{"via":"css","select":"h2","extract":"text"},
        "maxPages":1
      },
      "content":{"value":{"via":"css","select":".article-content","extract":"text"}}
    }"#;
631
632    #[tokio::test]
633    async fn engine_toc_splits_volumes_offline() {
634        let src = BookSource::from_json(SOURCE).unwrap();
635        let engine = Engine::with_fetcher(src, Arc::new(MockFetcher(CATALOG.to_string())));
636        let toc = engine.toc("/any").await.unwrap();
637        assert_eq!(toc.volumes.len(), 2, "应识别 2 卷");
638        assert_eq!(toc.chapters.len(), 3, "应识别 3 章");
639        assert_eq!(toc.chapters[0].title, "第一章");
640        assert_eq!(toc.chapters[0].url, "/n/1.html");
641        assert_eq!(toc.volumes[1].first_chapter_index, 2);
642    }
643
644    // ── 6.3/6.4:引擎构造的请求并入 loginHeader(JWT/Cookie 同路径)──
645    #[tokio::test]
646    async fn engine_merges_login_header_into_requests() {
647        let src = BookSource::from_json(SOURCE).unwrap();
648        let captured = Arc::new(Mutex::new(HashMap::new()));
649        let fetcher = Arc::new(RecordingFetcher {
650            body: CATALOG.to_string(),
651            last_headers: captured.clone(),
652        });
653        let mut lh = BTreeMap::new();
654        lh.insert("Authorization".into(), "Bearer T".into());
655        lh.insert("Cookie".into(), "sid=1".into());
656        let engine = Engine::with_fetcher(src, fetcher).with_login_header(lh);
657
658        // 任一取页路径都应带上 loginHeader(此处走 toc → fetch_pages → get_req)。
659        engine.toc("/any").await.unwrap();
660        let h = captured.lock().unwrap();
661        assert_eq!(
662            h.get("Authorization").map(String::as_str),
663            Some("Bearer T"),
664            "JWT 应每请求携带"
665        );
666        assert_eq!(
667            h.get("Cookie").map(String::as_str),
668            Some("sid=1"),
669            "Cookie 走同一注入路径"
670        );
671    }
672
673    // ── 审查/security:loginHeader 仅注入同注册域请求(页面内容诱导的第三方 URL 不外泄凭据)──
674    #[tokio::test]
675    async fn login_header_not_sent_to_other_registrable_domain() {
676        let src = BookSource::from_json(SOURCE).unwrap(); // 书源 url "https://x"
677        let captured = Arc::new(Mutex::new(HashMap::new()));
678        let fetcher = Arc::new(RecordingFetcher {
679            body: CATALOG.to_string(),
680            last_headers: captured.clone(),
681        });
682        let mut lh = BTreeMap::new();
683        lh.insert("Authorization".into(), "Bearer T".into());
684        lh.insert("Cookie".into(), "sid=1".into());
685        let engine = Engine::with_fetcher(src, fetcher).with_login_header(lh);
686        // 绝对 URL 指向其它注册域(模拟被挂马页面把 toc/next_page 指向第三方域)。
687        engine.toc("https://evil.example.org/any").await.unwrap();
688        let h = captured.lock().unwrap();
689        assert!(
690            h.get("Authorization").is_none(),
691            "跨注册域不应携带登录头: {h:?}"
692        );
693        assert!(
694            h.get("Cookie").is_none(),
695            "跨注册域不应携带登录 Cookie: {h:?}"
696        );
697    }
698
699    // ── 审查/correctness:含 \n 的 loginHeader(脏落盘数据)经 apply_auth 剥除,引擎请求可送出 ──
700    #[tokio::test]
701    async fn newline_in_login_header_sanitized_in_engine_requests() {
702        let src = BookSource::from_json(SOURCE).unwrap();
703        let captured = Arc::new(Mutex::new(HashMap::new()));
704        let fetcher = Arc::new(RecordingFetcher {
705            body: CATALOG.to_string(),
706            last_headers: captured.clone(),
707        });
708        let mut lh = BTreeMap::new();
709        // 模拟脚本把 \n 连接的多 Set-Cookie 直接写回 Cookie 后落盘的脏数据。
710        lh.insert("Cookie".into(), "a=1\nb=2".into());
711        let engine = Engine::with_fetcher(src, fetcher).with_login_header(lh);
712        engine.toc("/any").await.unwrap();
713        let h = captured.lock().unwrap();
714        let cookie = h.get("Cookie").cloned().unwrap_or_default();
715        assert!(!cookie.contains('\n'), "Cookie 的 \\n 应被剥除: {cookie:?}");
716        assert_eq!(cookie, "a=1b=2", "与 host 侧 sanitize 行为对称");
717    }
718
719    // 未登录(空 loginHeader)时不注入任何额外头(向后兼容)。
720    #[tokio::test]
721    async fn engine_without_login_header_adds_nothing() {
722        let src = BookSource::from_json(SOURCE).unwrap();
723        let captured = Arc::new(Mutex::new(HashMap::new()));
724        let fetcher = Arc::new(RecordingFetcher {
725            body: CATALOG.to_string(),
726            last_headers: captured.clone(),
727        });
728        let engine = Engine::with_fetcher(src, fetcher);
729        engine.toc("/any").await.unwrap();
730        assert!(captured.lock().unwrap().is_empty(), "未登录不应注入额外头");
731    }
732
733    // ── 12.2:loginCheckJs 在响应期判定登录失效 → LoginExpired ──
734    #[cfg(feature = "js")]
735    #[tokio::test]
736    async fn login_check_js_detects_expired() {
737        let json = SOURCE.replacen(
738            "\"bookInfo\":{}",
739            "\"loginCheckJs\":\"result.indexOf('未登录')<0\",\"bookInfo\":{}",
740            1,
741        );
742        let src = BookSource::from_json(&json).unwrap();
743        // 响应含「未登录」→ 判失效。
744        let bad = Engine::with_fetcher(
745            src.clone(),
746            Arc::new(MockFetcher("<html>未登录</html>".into())),
747        );
748        let err = bad.toc("/any").await.unwrap_err();
749        assert!(err.is_login_expired(), "应判登录失效: {err}");
750        // 正常响应(无「未登录」)→ 放行。
751        let ok = Engine::with_fetcher(src, Arc::new(MockFetcher(CATALOG.to_string())));
752        assert!(ok.toc("/any").await.is_ok(), "正常响应不应判失效");
753    }
754
755    // ── 10.2/10.3/10.6:enabledCookieJar 回灌 Set-Cookie → 后续请求携带 → persistent 导出 ──
756    #[tokio::test]
757    async fn enabled_cookie_jar_absorbs_resends_and_persists() {
758        let json = SOURCE.replacen(
759            "\"bookInfo\":{}",
760            "\"enabledCookieJar\":true,\"bookInfo\":{}",
761            1,
762        );
763        let src = BookSource::from_json(&json).unwrap();
764        let last = Arc::new(Mutex::new(None));
765        let fetcher = Arc::new(CookieEchoFetcher {
766            set_cookie: "token=xyz; Max-Age=3600; Path=/".to_string(),
767            last_cookie: last.clone(),
768        });
769        let engine = Engine::with_fetcher(src, fetcher);
770
771        // 首请求:无 cookie 发出,响应 Set-Cookie 被回灌。
772        engine.toc("/p1").await.unwrap();
773        assert!(last.lock().unwrap().is_none(), "首请求不应带 cookie");
774        // 后续请求:回灌的 token 随请求发出。
775        engine.book_info("/p2").await.unwrap();
776        assert_eq!(
777            last.lock().unwrap().clone(),
778            Some("token=xyz".to_string()),
779            "回灌 cookie 应随后续请求发出"
780        );
781        // persistent 导出含 token(Max-Age → persistent),供 app 落盘。
782        // 书源 url "https://x" 的注册域为 "x"。
783        assert_eq!(
784            engine.persistent_cookies().get("x").map(String::as_str),
785            Some("token=xyz")
786        );
787    }
788
789    // ── 审查/correctness:warmup 走统一 run_request,enabledCookieJar 时预热页 Set-Cookie 回灌 ──
790    #[tokio::test]
791    async fn warmup_absorbs_set_cookie_into_jar() {
792        let json = SOURCE.replacen(
793            "\"bookInfo\":{}",
794            "\"enabledCookieJar\":true,\"http\":{\"warmup\":[\"https://x/warm\"]},\"bookInfo\":{}",
795            1,
796        );
797        let src = BookSource::from_json(&json).unwrap();
798        let last = Arc::new(Mutex::new(None));
799        let fetcher = Arc::new(CookieEchoFetcher {
800            set_cookie: "token=warm; Max-Age=3600; Path=/".to_string(),
801            last_cookie: last.clone(),
802        });
803        let engine = Engine::with_fetcher(src, fetcher);
804        engine.warmup().await;
805        // 预热页种下的 cookie 应进引擎 CookieJar(persistent 可导出落盘 / net.getCookie 可见)。
806        assert_eq!(
807            engine.persistent_cookies().get("x").map(String::as_str),
808            Some("token=warm"),
809            "预热页的 Set-Cookie 应回灌引擎 cookie 库"
810        );
811    }
812
813    // 未开 enabledCookieJar 时不回灌(向后兼容)。
814    #[tokio::test]
815    async fn cookie_jar_disabled_does_not_absorb() {
816        let src = BookSource::from_json(SOURCE).unwrap();
817        let last = Arc::new(Mutex::new(None));
818        let fetcher = Arc::new(CookieEchoFetcher {
819            set_cookie: "token=xyz; Max-Age=3600".to_string(),
820            last_cookie: last.clone(),
821        });
822        let engine = Engine::with_fetcher(src, fetcher);
823        engine.toc("/p1").await.unwrap();
824        engine.book_info("/p2").await.unwrap();
825        assert!(
826            last.lock().unwrap().is_none(),
827            "未开 cookieJar 不应回灌/再发"
828        );
829        assert!(engine.persistent_cookies().is_empty());
830    }
831
832    // ───────────────────── 11.x:前置请求链 / 结构化捕获 ─────────────────────
833
834    /// 按 URL 子串路由到不同响应体的替身(模拟前置链:prepare → 主请求),并记录调用 URL。
835    struct ScriptedFetcher {
836        routes: Vec<(String, String)>,
837        calls: Arc<Mutex<Vec<String>>>,
838    }
839
840    #[async_trait]
841    impl Fetcher for ScriptedFetcher {
842        async fn fetch(&self, req: FetchRequest) -> std::result::Result<String, FetchError> {
843            self.calls.lock().unwrap().push(req.url.clone());
844            for (pat, body) in &self.routes {
845                if req.url.contains(pat.as_str()) {
846                    return Ok(body.clone());
847                }
848            }
849            Ok(String::new())
850        }
851    }
852
853    fn scripted(routes: Vec<(&str, &str)>) -> (Arc<ScriptedFetcher>, Arc<Mutex<Vec<String>>>) {
854        let calls = Arc::new(Mutex::new(Vec::new()));
855        let f = Arc::new(ScriptedFetcher {
856            routes: routes
857                .into_iter()
858                .map(|(a, b)| (a.to_string(), b.to_string()))
859                .collect(),
860            calls: calls.clone(),
861        });
862        (f, calls)
863    }
864
865    // 前置请求链捕获 token → 带入主搜索请求 URL(headline:本 op 内多步)。
866    #[tokio::test]
867    async fn prelude_captures_token_into_main_request() {
868        let json = r#"{
869          "schema":"trnovel-booksource/v2","name":"t","url":"https://x",
870          "search":{
871            "prelude":[{"url":{"template":"{{base}}/prepare"},
872              "capture":[{"name":"token","value":{"via":"raw","clean":[{"trim":true}]},"scope":"chapter"}]}],
873            "request":{"url":{"template":"{{base}}/search?kw={{key}}&token={{token}}"}},
874            "list":{"via":"css","select":".item"},
875            "item":{"name":{"via":"css","select":".t","extract":"text"}}
876          },
877          "bookInfo":{},
878          "toc":{"list":{"via":"css","select":"a"},"name":{"via":"css","select":"a"},"url":{"via":"css","select":"a","extract":{"attr":"href"}}},
879          "content":{"value":{"via":"css","select":".c"}}
880        }"#;
881        let src = BookSource::from_json(json).unwrap();
882        let (f, calls) = scripted(vec![
883            ("/prepare", "ABC"),
884            (
885                "/search",
886                r#"<div class="item"><span class="t">书名</span></div>"#,
887            ),
888        ]);
889        let engine = Engine::with_fetcher(src, f);
890        let items = engine.search("k", 1, 20).await.unwrap();
891        assert_eq!(items.len(), 1);
892        assert_eq!(items[0].info.name, "书名");
893        let c = calls.lock().unwrap();
894        assert!(
895            c.iter().any(|u| u.contains("/prepare")),
896            "应先跑前置 prepare: {c:?}"
897        );
898        assert!(
899            c.iter().any(|u| u.contains("token=ABC")),
900            "主搜索应带捕获的 token: {c:?}"
901        );
902    }
903
904    // source 作用域 + skipIfPresent:同一引擎跨调用复用 token,prepare 只跑一次。
905    #[tokio::test]
906    async fn skip_if_present_reuses_source_scope_token() {
907        let json = r#"{
908          "schema":"trnovel-booksource/v2","name":"t","url":"https://x",
909          "search":{
910            "prelude":[{"url":{"template":"{{base}}/prepare"},
911              "capture":[{"name":"token","value":{"via":"raw","clean":[{"trim":true}]},"scope":"source"}],
912              "skipIfPresent":["token"]}],
913            "request":{"url":{"template":"{{base}}/search?token={{token}}"}},
914            "list":{"via":"css","select":".item"},
915            "item":{"name":{"via":"css","select":".t","extract":"text"}}
916          },
917          "bookInfo":{},
918          "toc":{"list":{"via":"css","select":"a"},"name":{"via":"css","select":"a"},"url":{"via":"css","select":"a","extract":{"attr":"href"}}},
919          "content":{"value":{"via":"css","select":".c"}}
920        }"#;
921        let src = BookSource::from_json(json).unwrap();
922        let (f, calls) = scripted(vec![
923            ("/prepare", "TKN"),
924            (
925                "/search",
926                r#"<div class="item"><span class="t">x</span></div>"#,
927            ),
928        ]);
929        let engine = Engine::with_fetcher(src, f);
930        engine.search("a", 1, 20).await.unwrap();
931        engine.search("b", 1, 20).await.unwrap();
932        let prepares = calls
933            .lock()
934            .unwrap()
935            .iter()
936            .filter(|u| u.contains("/prepare"))
937            .count();
938        assert_eq!(
939            prepares, 1,
940            "skipIfPresent 应使 source 级 token 复用,prepare 只跑一次"
941        );
942        assert_eq!(
943            engine.source_vars().get("token").map(String::as_str),
944            Some("TKN")
945        );
946    }
947
948    // 主请求 vars 捕获对 list/item 可见(自捕获边界:响应后才可见)。
949    #[tokio::test]
950    async fn request_vars_visible_to_list_items() {
951        let json = r#"{
952          "schema":"trnovel-booksource/v2","name":"t","url":"https://x",
953          "search":{
954            "request":{"url":{"template":"{{base}}/s"},
955              "vars":{"site":{"via":"css","select":".site","extract":"text"}}},
956            "list":{"via":"css","select":".item"},
957            "item":{"name":{"template":"{{site}}-书"}}
958          },
959          "bookInfo":{},
960          "toc":{"list":{"via":"css","select":"a"},"name":{"via":"css","select":"a"},"url":{"via":"css","select":"a","extract":{"attr":"href"}}},
961          "content":{"value":{"via":"css","select":".c"}}
962        }"#;
963        let src = BookSource::from_json(json).unwrap();
964        let html = r#"<span class="site">甲站</span><div class="item">x</div>"#;
965        let engine = Engine::with_fetcher(src, Arc::new(MockFetcher(html.to_string())));
966        let items = engine.search("k", 1, 20).await.unwrap();
967        assert_eq!(items.len(), 1);
968        assert_eq!(
969            items[0].info.name, "甲站-书",
970            "item 模板应看到主请求捕获的 site"
971        );
972    }
973
974    // 空串捕获不写作用域层(抽取失败 → {{x}} 落空串,不污染)。
975    #[tokio::test]
976    async fn empty_capture_not_written() {
977        let json = r#"{
978          "schema":"trnovel-booksource/v2","name":"t","url":"https://x",
979          "search":{
980            "prelude":[{"url":{"template":"{{base}}/p"},
981              "capture":[{"name":"x","value":{"via":"css","select":".nope","extract":"text"},"scope":"source"}]}],
982            "request":{"url":{"template":"{{base}}/s?x={{x}}"}},
983            "list":{"via":"css","select":".item"},
984            "item":{"name":{"via":"css","select":".t","extract":"text"}}
985          },
986          "bookInfo":{},
987          "toc":{"list":{"via":"css","select":"a"},"name":{"via":"css","select":"a"},"url":{"via":"css","select":"a","extract":{"attr":"href"}}},
988          "content":{"value":{"via":"css","select":".c"}}
989        }"#;
990        let src = BookSource::from_json(json).unwrap();
991        let (f, calls) = scripted(vec![
992            ("/p", "<html></html>"),
993            ("/s", r#"<div class="item"><span class="t">y</span></div>"#),
994        ]);
995        let engine = Engine::with_fetcher(src, f);
996        engine.search("k", 1, 20).await.unwrap();
997        assert!(
998            !engine.source_vars().contains_key("x"),
999            "空串捕获不应写作用域层"
1000        );
1001        assert!(
1002            calls.lock().unwrap().iter().any(|u| u.contains("/s?x=")),
1003            "主请求应照常发出(x 为空串)"
1004        );
1005    }
1006
1007    // toc 前置 csrf → 目录抽取规则(concat)引用 {{csrf}}(headline:前置链 + 抽取可见捕获)。
1008    #[tokio::test]
1009    async fn toc_prelude_csrf_visible_to_extraction() {
1010        let json = r#"{
1011          "schema":"trnovel-booksource/v2","name":"t","url":"https://x",
1012          "bookInfo":{},
1013          "toc":{
1014            "prelude":[{"url":{"template":"{{base}}/prepare"},
1015              "capture":[{"name":"csrf","value":{"via":"raw","clean":[{"trim":true}]},"scope":"chapter"}]}],
1016            "list":{"via":"css","select":".ch"},
1017            "name":{"via":"css","select":"a","extract":"text"},
1018            "url":{"concat":[{"literal":"/c?sign="},{"template":"{{csrf}}"},{"literal":"&href="},{"via":"css","select":"a","extract":{"attr":"href"}}]},
1019            "maxPages":1
1020          },
1021          "content":{"value":{"via":"css","select":".c"}}
1022        }"#;
1023        let src = BookSource::from_json(json).unwrap();
1024        let (f, _calls) = scripted(vec![
1025            ("/prepare", "SIG"),
1026            (
1027                "/toc",
1028                r#"<div class="ch"><a href="/n/1.html">第一章</a></div>"#,
1029            ),
1030        ]);
1031        let engine = Engine::with_fetcher(src, f);
1032        let toc = engine.toc("/toc/1").await.unwrap();
1033        assert_eq!(toc.chapters.len(), 1);
1034        assert_eq!(
1035            toc.chapters[0].url, "/c?sign=SIG&href=/n/1.html",
1036            "目录 url 应拼入前置捕获的 csrf"
1037        );
1038    }
1039
1040    // 审查 fix1:主请求 header 值支持 {{name}} 模板,可引用前置捕获的 token。
1041    #[tokio::test]
1042    async fn main_request_headers_interpolate_captured_vars() {
1043        let json = r#"{
1044          "schema":"trnovel-booksource/v2","name":"t","url":"https://x",
1045          "search":{
1046            "prelude":[{"url":{"template":"{{base}}/prepare"},
1047              "capture":[{"name":"token","value":{"via":"raw","clean":[{"trim":true}]},"scope":"chapter"}]}],
1048            "request":{"url":{"template":"{{base}}/search"},
1049              "headers":{"Authorization":"Bearer {{token}}"}},
1050            "list":{"via":"css","select":".item"},
1051            "item":{"name":{"via":"css","select":".t","extract":"text"}}
1052          },
1053          "bookInfo":{},
1054          "toc":{"list":{"via":"css","select":"a"},"name":{"via":"css","select":"a"},"url":{"via":"css","select":"a","extract":{"attr":"href"}}},
1055          "content":{"value":{"via":"css","select":".c"}}
1056        }"#;
1057        let src = BookSource::from_json(json).unwrap();
1058        let seen = Arc::new(Mutex::new(None));
1059        struct HeaderProbe {
1060            seen: Arc<Mutex<Option<String>>>,
1061        }
1062        #[async_trait]
1063        impl Fetcher for HeaderProbe {
1064            async fn fetch(&self, req: FetchRequest) -> std::result::Result<String, FetchError> {
1065                if req.url.contains("/search") {
1066                    *self.seen.lock().unwrap() = req.headers.get("Authorization").cloned();
1067                    return Ok(r#"<div class="item"><span class="t">书</span></div>"#.to_string());
1068                }
1069                Ok("ABC".to_string()) // /prepare
1070            }
1071        }
1072        let engine = Engine::with_fetcher(src, Arc::new(HeaderProbe { seen: seen.clone() }));
1073        engine.search("k", 1, 20).await.unwrap();
1074        assert_eq!(
1075            seen.lock().unwrap().clone(),
1076            Some("Bearer ABC".to_string()),
1077            "主请求 header 应插值前置捕获的 token"
1078        );
1079    }
1080
1081    // 审查 fix2:多条 Request.vars 都被捕获且对 item 可见(BTreeMap 确定序)。
1082    #[tokio::test]
1083    async fn multiple_request_vars_all_captured() {
1084        let json = r#"{
1085          "schema":"trnovel-booksource/v2","name":"t","url":"https://x",
1086          "search":{
1087            "request":{"url":{"template":"{{base}}/s"},
1088              "vars":{
1089                "a":{"via":"css","select":".a","extract":"text"},
1090                "b":{"via":"css","select":".b","extract":"text"}
1091              }},
1092            "list":{"via":"css","select":".item"},
1093            "item":{"name":{"template":"{{a}}-{{b}}"}}
1094          },
1095          "bookInfo":{},
1096          "toc":{"list":{"via":"css","select":"a"},"name":{"via":"css","select":"a"},"url":{"via":"css","select":"a","extract":{"attr":"href"}}},
1097          "content":{"value":{"via":"css","select":".c"}}
1098        }"#;
1099        let src = BookSource::from_json(json).unwrap();
1100        let html = r#"<span class="a">甲</span><span class="b">乙</span><div class="item">x</div>"#;
1101        let engine = Engine::with_fetcher(src, Arc::new(MockFetcher(html.to_string())));
1102        let items = engine.search("k", 1, 20).await.unwrap();
1103        assert_eq!(
1104            items[0].info.name, "甲-乙",
1105            "多条 request.vars 应都被捕获且对 item 可见"
1106        );
1107    }
1108}