Skip to main content

parse_book_source/
source.rs

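//! v2 book-source configuration types (pure serde, mirroring `book-source.schema.json`).
//!
//! Rules are explicit structured objects; there is no compact string DSL. `Rule` is both the
//! configuration and the syntax tree that the evaluator walks (see design D1/D6).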
5
6use super::error::ConfigError;
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
10// ───────────────────────── 规则 AST ─────────────────────────
11
// Extraction backend of a rule; it determines how `select` is interpreted.
// Variants are (de)serialized in lowercase; `Css` is the `Default` value.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 抽取后端(决定 `select` 的语义)。
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(rename_all = "lowercase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Via {
    #[default]
    Css,
    Xpath,
    Json,
    Regex,
    // Use the current context value directly (only the `clean` steps run).
    /// 直接使用当前上下文值(只跑 clean)。
    Raw,
}
25
// How a value is taken from a match: either one of the `ExtractOp` names (a bare string)
// or an attribute lookup written as `{ "attr": "..." }`.
// Untagged: serde tries `Op` first, then `Attr`.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 取值方式(枚举字符串 或 `{ "attr": "..." }`)。
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(untagged)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Extract {
    Op(ExtractOp),
    Attr { attr: String },
}
34
35impl Default for Extract {
36    fn default() -> Self {
37        Extract::Op(ExtractOp::Text)
38    }
39}
40
// Text/HTML extraction operator. Variant names are (de)serialized in camelCase
// (e.g. `OwnText` <-> "ownText"); `Text` is the `Default` value.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 文本/HTML 取值算子。
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(rename_all = "camelCase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum ExtractOp {
    #[default]
    Text,
    OwnText,
    Html,
    InnerHtml,
    OuterHtml,
}
53
// One post-processing step applied to an extracted value.
// Every field is optional and omitted from output when `None`; unknown keys are rejected
// (`deny_unknown_fields`). How the fields combine is decided by the evaluator, which is not
// part of this file.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 单步后处理。
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct CleanStep {
    // Pattern text; presumably paired with `replace` (the test fixture uses them together).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub replace: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub trim: Option<bool>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prepend: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub append: Option<String>,
}
70
// Leaf rule: a single extraction performed in the current context.
// All fields have defaults, so an empty object `{}` deserializes to a default leaf; unknown
// keys are NOT rejected here (no `deny_unknown_fields`).
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 叶子规则:在当前上下文做一次抽取。
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(rename_all = "camelCase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct LeafRule {
    // Backend; always serialized, defaults to `Via::Css` when absent.
    #[serde(default)]
    pub via: Via,
    // Selector/expression text whose meaning depends on `via`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub select: Option<String>,
    // Signed index into the matches — presumably negative values count from the end;
    // TODO confirm against the evaluator.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub index: Option<i64>,
    // Value extraction mode; always serialized, defaults to the text operator.
    #[serde(default)]
    pub extract: Extract,
    // Post-processing steps; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub clean: Vec<CleanStep>,
}
87
// A rule: either a leaf or a combinator. Combinators are told apart by their unique key
// (see design D1).
//
// Deserialization is untagged and tries the variants in declaration order: the combinators
// (each with one required key) first, the leaf last as the fallback.
// NOTE(review): `LeafRule` has only defaulted fields and does not deny unknown keys, so an
// object that matches no combinator (e.g. a misspelled `firstOf`) still parses — as a default
// leaf — instead of failing. Confirm this is intended.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 一条规则:叶子,或组合子。组合子按其唯一键判别(见 design D1)。
///
/// 反序列化时按变体顺序尝试:组合子(各有唯一必填键)在前,叶子兜底。
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(untagged)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Rule {
    // Result of the first sub-rule that yields a non-empty value (fallback / self-healing).
    /// 取首个非空子规则结果(回退/自愈)。
    FirstOf {
        #[serde(rename = "firstOf")]
        first_of: Vec<Rule>,
    },
    // Concatenation of the non-empty sub-rule results; `join` defaults to the empty string.
    /// 拼接非空子规则结果。
    Concat {
        concat: Vec<Rule>,
        #[serde(default)]
        join: String,
    },
    // Literal value.
    /// 字面量。
    Literal { literal: String },
    // Template interpolation (`{{key}}` / `{{page}}` / named variables).
    /// 模板插值(`{{key}}`/`{{page}}`/命名变量)。
    Template { template: String },
    // Leaf (fallback variant).
    /// 叶子(兜底)。
    Leaf(LeafRule),
}
113
// URL-like field: either a plain string template or a full rule.
// Untagged: a JSON string becomes `Str`; anything else is tried as a (boxed) `Rule`.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// URL 字段:可为字符串模板,或一条规则。
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(untagged)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum UrlOrRule {
    Str(String),
    Rule(Box<Rule>),
}
122
123// ───────────────────────── HTTP / 请求 ─────────────────────────
124
// Character set of fetched pages. Variant names are (de)serialized with serde's kebab-case
// renaming (the test fixture uses "auto"); `Auto` is the `Default` value.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 字符集。
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Charset {
    #[default]
    Auto,
    Utf8,
    Gbk,
    Gb18030,
    Big5,
}
137
// Retry policy. Keys are camelCase (`max`, `backoffMs`); both default to 0 when omitted and
// unknown keys are rejected.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 重试策略。
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Retry {
    // Retry count limit — whether this includes the first attempt is not visible here.
    #[serde(default)]
    pub max: u32,
    // Back-off delay in milliseconds (per the field name).
    #[serde(default)]
    pub backoff_ms: u64,
}
148
// Rate limit. Keys are camelCase (`maxCount`, `perMs`); both are required when the object is
// present and unknown keys are rejected.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 速率限制。
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct RateLimit {
    // Presumably the number of requests allowed per window — TODO confirm against the fetcher.
    pub max_count: u64,
    // Window length in milliseconds (per the field name).
    pub per_ms: u64,
}
157
// Page-fetch mode: whether a browser may be used to solve anti-bot challenges.
// Actually opening a browser additionally needs app/user-level authorization (the two levels
// are intersected; see OpenSpec change `browser-fetcher` D12).
// Variants are (de)serialized in lowercase.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 取页模式:是否动用浏览器解反爬挑战。
/// 真正是否开浏览器还需 app/用户级授权(两级取交集,见 OpenSpec change `browser-fetcher` D12)。
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum FetchMode {
    // Default: plain reqwest normally, escalate to a browser only when a challenge is hit.
    /// 默认:平时 reqwest,撞挑战才升级浏览器。
    #[default]
    Auto,
    // Never open a browser; degrade when a challenge is hit.
    /// 永不开浏览器,撞挑战即降级。
    Reqwest,
    // Force the browser for the whole site (challenged on first request / fully JS-rendered).
    /// 整站强制走浏览器(首请求即被挑战 / 整页 JS 渲染)。
    Browser,
}
172
// HTTP configuration block. Keys are camelCase, every field is optional with a default, and
// unknown keys are rejected.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// HTTP 配置块。
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Http {
    #[serde(default)]
    pub headers: HashMap<String, String>,
    // Static cookies; also where a clearance cookie is injected at runtime.
    /// 静态 cookie;也是运行时注入 clearance cookie 的落点。
    #[serde(default)]
    pub cookies: HashMap<String, String>,
    // Pages to GET first in order to warm up the session cookies.
    /// 先 GET 这些页以预热会话 cookie。
    #[serde(default)]
    pub warmup: Vec<String>,
    #[serde(default)]
    pub charset: Charset,
    // Presumably milliseconds (the test fixture uses 15000) — TODO confirm against the fetcher.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub timeout: Option<u64>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub retry: Option<Retry>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub rate_limit: Option<RateLimit>,
    // Fetch mode (auto | reqwest | browser); defaults to auto.
    /// 取页模式(auto|reqwest|browser);默认 auto。
    #[serde(default)]
    pub fetcher: FetchMode,
}
198
// HTTP method, (de)serialized in upper case ("GET", "POST"); `Get` is the `Default` value.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// HTTP 方法。
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Method {
    #[default]
    Get,
    Post,
}
208
// A single request. Only `url` is required; keys are camelCase and unknown keys are rejected.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 单个请求。
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Request {
    pub url: UrlOrRule,
    // Defaults to GET when omitted.
    #[serde(default)]
    pub method: Method,
    // Optional body, expressed like a URL: a string template or a rule.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub body: Option<UrlOrRule>,
    #[serde(default)]
    pub headers: HashMap<String, String>,
    // Named captures made available to templates.
    /// 命名捕获,供 template 使用。
    #[serde(default)]
    pub vars: HashMap<String, Rule>,
}
225
226// ───────────────────────── 操作规则 ─────────────────────────
227
// Field-extraction rules for one book; every field may be omitted.
// Keys are camelCase (`bookUrl`, `lastChapter`, `tocUrl`, `wordCount`, ...) and unknown keys
// are rejected.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 一本书的字段抽取规则(均可省略)。
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct BookRules {
    // List item: link to the book's detail page (used for search/explore results; ignored in
    // the bookInfo stage).
    /// 列表项:指向书详情页的链接(搜索/浏览结果用;bookInfo 阶段忽略)。
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub book_url: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub name: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub author: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cover: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub intro: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub kind: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_chapter: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub toc_url: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub word_count: Option<Rule>,
}
253
// Search operation: the request to send, the rule selecting result items, and the per-item
// field rules. All three keys are required; unknown keys are rejected.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 搜索操作。
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct SearchOp {
    pub request: Request,
    pub list: Rule,
    pub item: BookRules,
}
263
// One explore (browse) category: a display title plus the URL (string template or rule) of
// its listing. Both keys are required; unknown keys are rejected.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 浏览分类。
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Category {
    pub title: String,
    pub url: UrlOrRule,
}
272
// Explore (browse) operation: the categories, the rule selecting list items, and the
// per-item field rules. All three keys are required; unknown keys are rejected.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 浏览操作。
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct ExploreOp {
    pub categories: Vec<Category>,
    pub list: Rule,
    pub item: BookRules,
}
282
/// Serde default for the `max_pages` fields: the value used when `maxPages` is omitted.
fn default_max_pages() -> u32 {
    const DEFAULT_MAX_PAGES: u32 = 100;
    DEFAULT_MAX_PAGES
}
286
// Table-of-contents rules (chapters + volumes + optional pagination).
// `list`, `name` and `url` are required; keys are camelCase and unknown keys are rejected.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 目录规则(章节 + 分卷 + 可选分页)。
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct TocRules {
    pub list: Rule,
    pub name: Rule,
    pub url: Rule,
    // Optional rule marking an entry as a volume heading — presumably a non-empty result
    // means "volume"; TODO confirm against the evaluator.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub is_volume: Option<Rule>,
    // Optional rule for the next page of a paginated table of contents.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub next_page: Option<Rule>,
    // Page cap; filled by `default_max_pages` when omitted, and always serialized.
    #[serde(default = "default_max_pages")]
    pub max_pages: u32,
}
302
// Chapter-content rules (optional pagination). Only `value` is required; keys are camelCase
// and unknown keys are rejected.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 正文规则(可选分页)。
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct ContentRules {
    pub value: Rule,
    // Optional rule for the next page of a paginated chapter.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub next_page: Option<Rule>,
    // Page cap; filled by `default_max_pages` when omitted, and always serialized.
    #[serde(default = "default_max_pages")]
    pub max_pages: u32,
}
314
315// ───────────────────────── 样例 ─────────────────────────
316
// Invariants a sample is expected to satisfy. All fields are optional; keys are camelCase.
// NOTE(review): unlike `Sample`, this struct does not set `deny_unknown_fields`, so a
// misspelled expectation key is silently ignored — confirm whether that is intended.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 样例期望不变量。
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Expect {
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_chapters: Option<usize>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub volumes: Option<usize>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_content_chars: Option<usize>,
}
331
// Golden sample: a book URL plus the expectations to check against it.
// `bookUrl` is required, `expect` defaults to "no expectations"; unknown keys are rejected.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// 黄金样例。
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Sample {
    pub book_url: String,
    #[serde(default)]
    pub expect: Expect,
}
341
342// ───────────────────────── 顶层书源 ─────────────────────────
343
// A v2 book source (top-level document). Keys are camelCase and unknown keys are rejected.
// Required: `schema`, `name`, `url`, `bookInfo`, `toc`, `content`.
// NOTE(review): `///` text kept verbatim — with the `schema` feature, schemars is expected to
// emit it as schema descriptions, which `schema_is_in_sync` compares to the committed file.
/// v2 书源。
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct BookSource {
    // Expected to be exactly "trnovel-booksource/v2". It is a plain `String` here, so
    // deserialization alone does not enforce the value.
    /// 固定为 `"trnovel-booksource/v2"`。
    pub schema: String,
    pub name: String,
    // Optional grouping label; omitted from output when empty.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub group: String,
    // Site base URL, used to resolve relative links and for `{{base}}`.
    /// 站点基址,用于相对链接解析与 `{{base}}`。
    pub url: String,
    #[serde(default)]
    pub http: Http,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub search: Option<SearchOp>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub explore: Option<ExploreOp>,
    pub book_info: BookRules,
    pub toc: TocRules,
    pub content: ContentRules,
    // Golden samples; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub samples: Vec<Sample>,
}
368
/// The expected value of a book source's `schema` field.
pub const SCHEMA_ID: &str = "trnovel-booksource/v2";
371
372impl BookSource {
373    /// 从 JSON 字符串解析一个书源。
374    pub fn from_json(s: &str) -> Result<Self, ConfigError> {
375        Ok(serde_json::from_str(s)?)
376    }
377
378    /// 从 JSON 值解析一个或多个书源(支持单对象或数组)。
379    pub fn from_value_many(value: serde_json::Value) -> Result<Vec<Self>, ConfigError> {
380        if value.is_array() {
381            Ok(serde_json::from_value(value)?)
382        } else {
383            Ok(vec![serde_json::from_value(value)?])
384        }
385    }
386
387    /// 从本地文件导入(支持单对象或数组)。
388    pub fn from_path(path: &str) -> Result<Vec<Self>, super::error::BookSourceError> {
389        let text = std::fs::read_to_string(path).map_err(ConfigError::Io)?;
390        let value = serde_json::from_str(&text).map_err(ConfigError::Json)?;
391        Ok(Self::from_value_many(value)?)
392    }
393
394    /// 从网络 URL 导入(支持单对象或数组)。
395    pub async fn from_url(url: &str) -> Result<Vec<Self>, super::error::BookSourceError> {
396        use super::error::FetchError;
397        let text = reqwest::get(url)
398            .await
399            .map_err(FetchError::Http)?
400            // 先判 HTTP 状态:4xx/5xx 返回错误页时,避免把"非 JSON"误报成 JSON 解析失败。
401            .error_for_status()
402            .map_err(FetchError::Http)?
403            .text()
404            .await
405            .map_err(FetchError::Http)?;
406        let value = serde_json::from_str(&text).map_err(ConfigError::Json)?;
407        Ok(Self::from_value_many(value)?)
408    }
409}
410
411#[cfg(test)]
412mod tests {
413    use super::*;
414
    /// Representative book source with the same structure as examples/bilixs.v2.json (covers
    /// leaf / firstOf / concat / template / attr / http+cookies / samples / volume `isVolume`).
    const BILIXS_V2: &str = r#"{
      "schema": "trnovel-booksource/v2",
      "name": "哔哩小说",
      "group": "测试",
      "url": "https://www.bilixs.com",
      "http": {
        "headers": { "User-Agent": "Mozilla/5.0" },
        "cookies": {},
        "warmup": ["https://www.bilixs.com/"],
        "charset": "auto",
        "timeout": 15000,
        "retry": { "max": 2, "backoffMs": 500 }
      },
      "search": {
        "request": { "url": { "template": "{{base}}/search.html?searchkey={{key}}" }, "method": "GET" },
        "list": { "via": "css", "select": ".module-item" },
        "item": {
          "name": { "via": "css", "select": ".module-item-title", "extract": "text" },
          "tocUrl": { "via": "css", "select": ".module-item-title", "extract": { "attr": "href" } }
        }
      },
      "explore": {
        "categories": [ { "title": "最近更新", "url": { "template": "{{base}}/book/lastupdate_0_1_0_0_0_0_0_{{page}}_0.html" } } ],
        "list": { "via": "css", "select": ".module-item" },
        "item": { "name": { "via": "css", "select": ".module-item-title", "extract": "text" } }
      },
      "bookInfo": {
        "name": { "via": "css", "select": "[property=\"og:novel:book_name\"]", "extract": { "attr": "content" } },
        "cover": { "via": "css", "select": "[property=\"og:image\"]", "extract": { "attr": "content" } },
        "kind": { "concat": [
            { "via": "css", "select": "[property=\"og:novel:tags\"]", "extract": { "attr": "content" } },
            { "via": "css", "select": "[property=\"og:novel:status\"]", "extract": { "attr": "content" } }
          ], "join": " · " },
        "tocUrl": { "via": "css", "select": "[property=\"og:novel:read_url\"]", "extract": { "attr": "content" } }
      },
      "toc": {
        "list": { "via": "css", "select": ".box > h2.module-title.type, .box a.module-row-text" },
        "name": { "firstOf": [
            { "via": "css", "select": ".module-row-title", "extract": "text" },
            { "via": "css", "select": "h2", "extract": "text" }
          ] },
        "url": { "via": "css", "select": "a", "extract": { "attr": "href" } },
        "isVolume": { "via": "css", "select": "h2", "extract": "text" },
        "maxPages": 1
      },
      "content": {
        "value": { "via": "css", "select": ".article-content", "extract": "html",
          "clean": [ { "regex": "请收藏本站[^<\\n]*", "replace": "" }, { "trim": true } ] }
      },
      "samples": [
        { "bookUrl": "/novel/guzhenren.html", "expect": { "name": "蛊真人", "volumes": 8, "minChapters": 2000 } }
      ]
    }"#;
470
471    #[test]
472    fn parses_v2_book_source() {
473        let bs = BookSource::from_json(BILIXS_V2).expect("应解析 v2 书源");
474        assert_eq!(bs.schema, SCHEMA_ID);
475        assert_eq!(bs.name, "哔哩小说");
476    }
477
478    #[test]
479    fn toc_name_is_firstof_with_two_leaves() {
480        let bs = BookSource::from_json(BILIXS_V2).unwrap();
481        match &bs.toc.name {
482            Rule::FirstOf { first_of } => assert_eq!(first_of.len(), 2),
483            other => panic!("toc.name 应为 firstOf,实际 {other:?}"),
484        }
485    }
486
487    #[test]
488    fn toc_is_volume_is_leaf_css_h2() {
489        let bs = BookSource::from_json(BILIXS_V2).unwrap();
490        let iv = bs.toc.is_volume.as_ref().expect("isVolume 应存在");
491        match iv {
492            Rule::Leaf(l) => {
493                assert_eq!(l.via, Via::Css);
494                assert_eq!(l.select.as_deref(), Some("h2"));
495            }
496            other => panic!("isVolume 应为叶子,实际 {other:?}"),
497        }
498    }
499
500    #[test]
501    fn search_url_is_template_rule() {
502        let bs = BookSource::from_json(BILIXS_V2).unwrap();
503        let req = &bs.search.as_ref().unwrap().request;
504        match &req.url {
505            UrlOrRule::Rule(r) => assert!(matches!(**r, Rule::Template { .. })),
506            other => panic!("search.request.url 应为模板规则,实际 {other:?}"),
507        }
508    }
509
510    #[test]
511    fn book_info_cover_extracts_attr() {
512        let bs = BookSource::from_json(BILIXS_V2).unwrap();
513        match bs.book_info.cover.as_ref().unwrap() {
514            Rule::Leaf(l) => assert_eq!(
515                l.extract,
516                Extract::Attr {
517                    attr: "content".into()
518                }
519            ),
520            other => panic!("cover 应为属性抽取叶子,实际 {other:?}"),
521        }
522    }
523
524    #[test]
525    fn http_cookies_and_warmup_parsed() {
526        let bs = BookSource::from_json(BILIXS_V2).unwrap();
527        assert_eq!(bs.http.warmup, vec!["https://www.bilixs.com/"]);
528        assert_eq!(bs.http.charset, Charset::Auto);
529        assert_eq!(bs.http.retry.as_ref().unwrap().backoff_ms, 500);
530    }
531
532    #[test]
533    fn sample_expectations_parsed() {
534        let bs = BookSource::from_json(BILIXS_V2).unwrap();
535        let s = &bs.samples[0];
536        assert_eq!(s.expect.volumes, Some(8));
537        assert_eq!(s.expect.min_chapters, Some(2000));
538    }
539
540    #[test]
541    fn round_trips_through_json() {
542        let bs = BookSource::from_json(BILIXS_V2).unwrap();
543        let json = serde_json::to_string(&bs).unwrap();
544        let bs2 = BookSource::from_json(&json).unwrap();
545        assert_eq!(bs, bs2);
546    }
547
548    #[test]
549    fn rejects_unknown_top_level_field() {
550        let bad = BILIXS_V2.replacen("\"name\":", "\"nmae\":", 1);
551        assert!(
552            BookSource::from_json(&bad).is_err(),
553            "拼错字段应被 deny_unknown_fields 拒绝"
554        );
555    }
556}
557
/// Drift guard: `book-source.schema.json` must equal the schema generated from the types
/// right now (`--features schema`).
/// A failure means the configuration types changed without regenerating the schema; rerun
/// gen_schema as the assertion message says.
#[cfg(all(test, feature = "schema"))]
mod schema_sync {
    #[test]
    fn schema_is_in_sync() {
        let committed = include_str!("../book-source.schema.json");
        let root_schema = schemars::schema_for!(crate::BookSource);
        let generated = serde_json::to_string_pretty(&root_schema).unwrap();
        // Surrounding whitespace (e.g. a trailing newline in the file) is not significant.
        assert_eq!(
            generated.trim(),
            committed.trim(),
            "book-source.schema.json 与配置类型不同步;请重新生成:\n  \
             cargo run -p parse-book-source --features schema --example gen_schema \
             > crates/parse-book-source/book-source.schema.json"
        );
    }
}