Skip to main content

parse_book_source/
engine.rs

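//! Use-case engine (Template Method + Paginator). The five operations share one skeleton:
//! fetch page → select list/value → map → optional bounded pagination. `Engine` is cheap to
//! `Clone` (it holds `Arc`s internally), and no operation holds a lock across an `.await` (D10).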
3
4use super::error::{BookSourceError, Result};
5use super::eval::{Vars, eval_list, eval_value};
6use super::fetch::{FetchRequest, Fetcher, ReqwestFetcher};
7use super::model::{BookInfo, BookListItem, Chapter, Toc, Volume};
8use super::source::{BookRules, BookSource, Category, Rule, UrlOrRule};
9use std::sync::Arc;
10
/// Runtime engine for a single book source.
///
/// Cloning is cheap: both fields are `Arc`s, so a clone only bumps reference counts.
#[derive(Clone)]
pub struct Engine {
    /// Shared book-source configuration; exposed read-only through `Engine::source`.
    source: Arc<BookSource>,
    /// Page-fetching backend that the engine's operations send their requests through.
    fetcher: Arc<dyn Fetcher>,
}
17
18impl std::fmt::Debug for Engine {
19    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
20        f.debug_struct("Engine")
21            .field("source", &self.source.name)
22            .finish_non_exhaustive()
23    }
24}
25
26impl Engine {
27    /// 用默认 reqwest 取页后端构建。
28    pub fn new(source: BookSource) -> Result<Self> {
29        let fetcher = Arc::new(ReqwestFetcher::new(&source)?);
30        Ok(Self {
31            source: Arc::new(source),
32            fetcher,
33        })
34    }
35
36    /// 注入自定义取页后端(便于测试替身 / 反爬适配器)。
37    pub fn with_fetcher(source: BookSource, fetcher: Arc<dyn Fetcher>) -> Self {
38        Self {
39            source: Arc::new(source),
40            fetcher,
41        }
42    }
43
44    /// 用「升级式取页」构建(`browser` feature):平时 reqwest,撞挑战且 `browser` 为
45    /// `Some` 时升级解挑战。是否传入浏览器(书源 `http.fetcher` ∧ 用户授权 ∧ 探测到)
46    /// 的策略由调用方(app)决定;`None` 等同纯 reqwest(撞挑战即降级)。
47    #[cfg(feature = "browser")]
48    pub fn with_browser_assist(
49        source: BookSource,
50        browser: Option<crate::browser::BrowserFetcher>,
51    ) -> Result<Self> {
52        let fetcher = crate::browser::EscalatingFetcher::new(&source, browser)?;
53        Ok(Self {
54            source: Arc::new(source),
55            fetcher: Arc::new(fetcher),
56        })
57    }
58
59    /// 暴露只读配置。
60    pub fn source(&self) -> &BookSource {
61        &self.source
62    }
63
64    fn base_vars(&self) -> Vars {
65        let mut v = Vars::new();
66        v.insert(
67            "base".into(),
68            self.source.url.trim_end_matches('/').to_string(),
69        );
70        v
71    }
72
73    /// 预热:按 `http.warmup` 先访问若干页以累积会话 cookie(失败忽略)。
74    pub async fn warmup(&self) {
75        for u in &self.source.http.warmup {
76            let _ = self.fetcher.fetch(FetchRequest::get(u.clone())).await;
77        }
78    }
79
80    /// 书籍详情。
81    pub async fn book_info(&self, book_url: &str) -> Result<BookInfo> {
82        let html = self.fetcher.fetch(FetchRequest::get(book_url)).await?;
83        let vars = self.base_vars();
84        self.eval_book_info(&self.source.book_info, &html, &vars)
85    }
86
87    /// 目录(章节 + 分卷),支持有界分页。
88    pub async fn toc(&self, toc_url: &str) -> Result<Toc> {
89        let toc = &self.source.toc;
90        let vars = self.base_vars();
91        let pages = self
92            .fetch_pages(toc_url, toc.next_page.as_ref(), toc.max_pages, &vars)
93            .await?;
94
95        let mut chapters: Vec<Chapter> = Vec::new();
96        let mut volumes: Vec<Volume> = Vec::new();
97        for page in &pages {
98            for item in eval_list(&toc.list, page)? {
99                let title = eval_value(&toc.name, &item, &vars)?;
100                let is_volume = match &toc.is_volume {
101                    Some(r) => !eval_value(r, &item, &vars)?.trim().is_empty(),
102                    None => false,
103                };
104                if is_volume {
105                    volumes.push(Volume {
106                        title,
107                        first_chapter_index: chapters.len(),
108                    });
109                } else {
110                    let url = eval_value(&toc.url, &item, &vars)?;
111                    chapters.push(Chapter {
112                        title,
113                        url,
114                        is_volume: false,
115                    });
116                }
117            }
118        }
119        Ok(Toc { chapters, volumes })
120    }
121
122    /// 正文,支持有界分页。
123    pub async fn content(&self, chapter_url: &str) -> Result<String> {
124        let c = &self.source.content;
125        let vars = self.base_vars();
126        let pages = self
127            .fetch_pages(chapter_url, c.next_page.as_ref(), c.max_pages, &vars)
128            .await?;
129        let mut parts = Vec::with_capacity(pages.len());
130        for page in &pages {
131            parts.push(eval_value(&c.value, page, &vars)?);
132        }
133        Ok(parts.join("\n"))
134    }
135
136    /// 搜索。
137    pub async fn search(&self, key: &str, page: u32, page_size: u32) -> Result<Vec<BookListItem>> {
138        let op = self
139            .source
140            .search
141            .as_ref()
142            .ok_or(BookSourceError::Missing("search"))?;
143        let mut vars = self.base_vars();
144        vars.insert("key".into(), key.to_string());
145        vars.insert("page".into(), page.to_string());
146        vars.insert("pageSize".into(), page_size.to_string());
147
148        let url = self.resolve_url(&op.request.url, &vars)?;
149        let body = match &op.request.body {
150            Some(b) => Some(self.resolve_url(b, &vars)?),
151            None => None,
152        };
153        let html = self
154            .fetcher
155            .fetch(FetchRequest {
156                url,
157                method: op.request.method,
158                body,
159                headers: op.request.headers.clone(),
160            })
161            .await?;
162        self.eval_list_items(&op.list, &op.item, &html)
163    }
164
165    /// 浏览某分类的某一页。
166    pub async fn explore(
167        &self,
168        category_url: &UrlOrRule,
169        page: u32,
170        page_size: u32,
171    ) -> Result<Vec<BookListItem>> {
172        let op = self
173            .source
174            .explore
175            .as_ref()
176            .ok_or(BookSourceError::Missing("explore"))?;
177        let mut vars = self.base_vars();
178        vars.insert("page".into(), page.to_string());
179        vars.insert("pageSize".into(), page_size.to_string());
180        let url = self.resolve_url(category_url, &vars)?;
181        let html = self.fetcher.fetch(FetchRequest::get(url)).await?;
182        self.eval_list_items(&op.list, &op.item, &html)
183    }
184
185    /// 浏览分类列表,供上层选择后翻页。
186    pub fn explore_categories(&self) -> Vec<Category> {
187        self.source
188            .explore
189            .as_ref()
190            .map(|e| e.categories.clone())
191            .unwrap_or_default()
192    }
193
    // ── Internals ──
195
196    /// 有界分页抓取:从 `start` 起,若 `next_page` 求值得非空 URL 则续抓,直到为空或达 `max_pages`。
197    async fn fetch_pages(
198        &self,
199        start: &str,
200        next_page: Option<&Rule>,
201        max_pages: u32,
202        vars: &Vars,
203    ) -> Result<Vec<String>> {
204        let mut pages = Vec::new();
205        let mut url = start.to_string();
206        for _ in 0..max_pages.max(1) {
207            let html = self.fetcher.fetch(FetchRequest::get(url.clone())).await?;
208            let next = match next_page {
209                Some(r) => eval_value(r, &html, vars)?,
210                None => String::new(),
211            };
212            pages.push(html);
213            if next.trim().is_empty() {
214                break;
215            }
216            url = next;
217        }
218        Ok(pages)
219    }
220
221    fn eval_list_items(
222        &self,
223        list: &Rule,
224        item: &BookRules,
225        html: &str,
226    ) -> Result<Vec<BookListItem>> {
227        let vars = self.base_vars();
228        let mut out = Vec::new();
229        for ctx in eval_list(list, html)? {
230            let info = self.eval_book_info(item, &ctx, &vars)?;
231            let book_url = opt_eval(item.book_url.as_ref(), &ctx, &vars)?;
232            out.push(BookListItem { info, book_url });
233        }
234        Ok(out)
235    }
236
237    fn eval_book_info(&self, r: &BookRules, ctx: &str, vars: &Vars) -> Result<BookInfo> {
238        Ok(BookInfo {
239            name: opt_eval(r.name.as_ref(), ctx, vars)?,
240            author: opt_eval(r.author.as_ref(), ctx, vars)?,
241            cover: opt_eval(r.cover.as_ref(), ctx, vars)?,
242            intro: opt_eval(r.intro.as_ref(), ctx, vars)?,
243            kind: opt_eval(r.kind.as_ref(), ctx, vars)?,
244            last_chapter: opt_eval(r.last_chapter.as_ref(), ctx, vars)?,
245            toc_url: opt_eval(r.toc_url.as_ref(), ctx, vars)?,
246            word_count: opt_eval(r.word_count.as_ref(), ctx, vars)?,
247        })
248    }
249
250    fn resolve_url(&self, u: &UrlOrRule, vars: &Vars) -> Result<String> {
251        Ok(match u {
252            // 字符串按模板插值({{base}}/{{key}}/{{page}} 等)。
253            UrlOrRule::Str(s) => eval_value(
254                &Rule::Template {
255                    template: s.clone(),
256                },
257                "",
258                vars,
259            )?,
260            UrlOrRule::Rule(r) => eval_value(r, "", vars)?,
261        })
262    }
263}
264
265/// 求值一个可选规则;None 或空 → 空串。
266fn opt_eval(rule: Option<&Rule>, ctx: &str, vars: &Vars) -> Result<String> {
267    Ok(match rule {
268        Some(r) => eval_value(r, ctx, vars)?,
269        None => String::new(),
270    })
271}
272
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::FetchError;
    use crate::fetch::Fetcher;
    use async_trait::async_trait;

    /// Fetcher stand-in that returns a fixed HTML string for every request, so the engine
    /// can be unit-tested offline (D9).
    struct MockFetcher(String);

    #[async_trait]
    impl Fetcher for MockFetcher {
        // The request is ignored; the canned HTML is returned regardless of URL or method.
        async fn fetch(&self, _req: FetchRequest) -> std::result::Result<String, FetchError> {
            Ok(self.0.clone())
        }
    }

    // Catalog page fixture: two volume headings directly under `.box`, three chapter
    // links, plus one extra heading nested inside a `<span>`.
    const CATALOG: &str = r#"<html><body><div class="box">
        <span id="shuqian"><h2 class="module-title type">阅读进度</h2></span>
        <h2 class="module-title type">第一卷</h2>
        <div class="module-row-info"><a class="module-row-text" href="/n/1.html"><div class="module-row-title"><span>第一章</span></div></a></div>
        <div class="module-row-info"><a class="module-row-text" href="/n/2.html"><div class="module-row-title"><span>第二章</span></div></a></div>
        <h2 class="module-title type">第二卷</h2>
        <div class="module-row-info"><a class="module-row-text" href="/n/3.html"><div class="module-row-title"><span>第三章</span></div></a></div>
        </div></body></html>"#;

    // Minimal book-source JSON: the TOC list rule selects direct-child `h2` headings and
    // chapter links; `isVolume` extracts `h2` text, so headings are classified as volumes.
    const SOURCE: &str = r#"{
      "schema":"trnovel-booksource/v2","name":"t","url":"https://x",
      "bookInfo":{},
      "toc":{
        "list":{"via":"css","select":".box > h2.module-title.type, .box a.module-row-text"},
        "name":{"firstOf":[{"via":"css","select":".module-row-title","extract":"text"},{"via":"css","select":"h2","extract":"text"}]},
        "url":{"via":"css","select":"a","extract":{"attr":"href"}},
        "isVolume":{"via":"css","select":"h2","extract":"text"},
        "maxPages":1
      },
      "content":{"value":{"via":"css","select":".article-content","extract":"text"}}
    }"#;

    /// Offline check that `Engine::toc` separates volume headings from chapters and
    /// records, for each volume, the index of its first chapter.
    #[tokio::test]
    async fn engine_toc_splits_volumes_offline() {
        let src = BookSource::from_json(SOURCE).unwrap();
        let engine = Engine::with_fetcher(src, Arc::new(MockFetcher(CATALOG.to_string())));
        // The URL is irrelevant here: the mock returns CATALOG for any request.
        let toc = engine.toc("/any").await.unwrap();
        // Only the two headings that are direct children of `.box` are expected as volumes.
        assert_eq!(toc.volumes.len(), 2, "应识别 2 卷");
        assert_eq!(toc.chapters.len(), 3, "应识别 3 章");
        assert_eq!(toc.chapters[0].title, "第一章");
        assert_eq!(toc.chapters[0].url, "/n/1.html");
        // The second volume starts after the two chapters of the first volume.
        assert_eq!(toc.volumes[1].first_chapter_index, 2);
    }
}
323}