// Source file: parse_book_source/source.rs

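//! v2 book-source configuration types (pure serde, mirroring `book-source.schema.json`).
//!
//! Rules are explicit structured objects; there is no compact string DSL. A `Rule` is both
//! the configuration and the syntax tree that the evaluator walks (see design D1/D6).
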
use super::error::ConfigError;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

// ───────────────────────── Rule AST ─────────────────────────

12/// 抽取后端(决定 `select` 的语义)。
13#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
14#[serde(rename_all = "lowercase")]
15#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
16pub enum Via {
17    #[default]
18    Css,
19    Xpath,
20    Json,
21    Regex,
22    /// 直接使用当前上下文值(只跑 clean)。
23    Raw,
24}
25
26/// 取值方式(枚举字符串 或 `{ "attr": "..." }`)。
27#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
28#[serde(untagged)]
29#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
30pub enum Extract {
31    Op(ExtractOp),
32    Attr { attr: String },
33}
34
35impl Default for Extract {
36    fn default() -> Self {
37        Extract::Op(ExtractOp::Text)
38    }
39}
40
41/// 文本/HTML 取值算子。
42#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
43#[serde(rename_all = "camelCase")]
44#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
45pub enum ExtractOp {
46    #[default]
47    Text,
48    OwnText,
49    Html,
50    InnerHtml,
51    OuterHtml,
52}
53
54/// 编解码方式(`decode`/`encode` 算子,以及 crypto 的字节↔串编码)。
55#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
56#[serde(rename_all = "camelCase")]
57#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
58pub enum Codec {
59    Base64,
60    Base64url,
61    Hex,
62    /// URL 百分号编解码。
63    Url,
64}
65
66/// crypto 的 key/iv/输入/输出字节编码。
67#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
68#[serde(rename_all = "lowercase")]
69#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
70pub enum ByteEnc {
71    #[default]
72    Utf8,
73    Base64,
74    Hex,
75    /// 原样字节(等同 utf8 字节,主要用于输入密文已是裸字节串的场景)。
76    Raw,
77}
78
79/// 哈希算法。
80#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
81#[serde(rename_all = "lowercase")]
82#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
83pub enum HashAlgo {
84    Md5,
85    Sha1,
86    Sha256,
87    Sha512,
88}
89
90/// 哈希/HMAC 输出编码。
91#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
92#[serde(rename_all = "lowercase")]
93#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
94pub enum HashOut {
95    #[default]
96    Hex,
97    Base64,
98}
99
100/// 哈希算子(可选 HMAC)。
101#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
102#[serde(rename_all = "camelCase", deny_unknown_fields)]
103#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
104pub struct HashStep {
105    pub algo: HashAlgo,
106    #[serde(default)]
107    pub output: HashOut,
108    /// 提供则计算 HMAC(以此为密钥)。
109    #[serde(default, skip_serializing_if = "Option::is_none")]
110    pub hmac_key: Option<String>,
111    #[serde(default)]
112    pub hmac_key_enc: ByteEnc,
113}
114
115/// 对称加密算法。
116#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
117#[serde(rename_all = "camelCase")]
118#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
119pub enum CipherAlgo {
120    Aes,
121    Des,
122    TripleDes,
123}
124
125/// 加密模式。
126#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
127#[serde(rename_all = "lowercase")]
128#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
129pub enum CipherMode {
130    Cbc,
131    Ecb,
132    Cfb,
133    Gcm,
134}
135
136/// 填充方式(gcm 忽略)。
137#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
138#[serde(rename_all = "lowercase")]
139#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
140pub enum Padding {
141    #[default]
142    Pkcs7,
143    Zero,
144    None,
145}
146
147/// 加解密方向。
148#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
149#[serde(rename_all = "lowercase")]
150#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
151pub enum CipherOp {
152    #[default]
153    Decrypt,
154    Encrypt,
155}
156
157/// 加解密算子。默认值贴合「解密正文」主场景:`op=decrypt`、`inputEnc=base64`、`outputEnc=utf8`。
158#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
159#[serde(rename_all = "camelCase", deny_unknown_fields)]
160#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
161pub struct CipherStep {
162    pub algo: CipherAlgo,
163    pub mode: CipherMode,
164    #[serde(default)]
165    pub padding: Padding,
166    #[serde(default)]
167    pub op: CipherOp,
168    pub key: String,
169    #[serde(default)]
170    pub key_enc: ByteEnc,
171    #[serde(default, skip_serializing_if = "Option::is_none")]
172    pub iv: Option<String>,
173    #[serde(default)]
174    pub iv_enc: ByteEnc,
175    /// 入参密文串→字节;省略时按 `op` 取默认(decrypt→base64,encrypt→utf8)。
176    #[serde(default, skip_serializing_if = "Option::is_none")]
177    pub input_enc: Option<ByteEnc>,
178    /// 结果字节→串;省略时按 `op` 取默认(decrypt→utf8,encrypt→base64)。
179    #[serde(default, skip_serializing_if = "Option::is_none")]
180    pub output_enc: Option<ByteEnc>,
181}
182
183/// 繁简转换方向。
184#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
185#[serde(rename_all = "lowercase")]
186#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
187pub enum CnConvert {
188    /// 繁体 → 简体。
189    T2s,
190    /// 简体 → 繁体。
191    S2t,
192}
193
194/// 单步后处理。步内多算子按固定顺序执行:
195/// `regex/replace → trim → prepend → append → decode → encode → hash → cipher → fontMap → cn`。
196#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
197#[serde(deny_unknown_fields)]
198#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
199pub struct CleanStep {
200    #[serde(default, skip_serializing_if = "Option::is_none")]
201    pub regex: Option<String>,
202    #[serde(default, skip_serializing_if = "Option::is_none")]
203    pub replace: Option<String>,
204    #[serde(default, skip_serializing_if = "Option::is_none")]
205    pub trim: Option<bool>,
206    #[serde(default, skip_serializing_if = "Option::is_none")]
207    pub prepend: Option<String>,
208    #[serde(default, skip_serializing_if = "Option::is_none")]
209    pub append: Option<String>,
210    /// 解码(base64/base64url/hex/url)。
211    #[serde(default, skip_serializing_if = "Option::is_none")]
212    pub decode: Option<Codec>,
213    /// 编码(base64/base64url/hex/url)。
214    #[serde(default, skip_serializing_if = "Option::is_none")]
215    pub encode: Option<Codec>,
216    /// 哈希/HMAC。
217    #[serde(default, skip_serializing_if = "Option::is_none")]
218    pub hash: Option<HashStep>,
219    /// 对称加解密。
220    #[serde(default, skip_serializing_if = "Option::is_none")]
221    pub cipher: Option<CipherStep>,
222    /// 字体反爬还原:私有区(PUA)字符按映射表换回真字。键为码点十六进制(如 `"E4DE"` 或 `"U+E4DE"`),
223    /// 值为目标字符;表外字符原样保留。用于番茄等「自定义字体 + PUA」反爬站点——表是数据,由书源内联
224    /// 提供(引擎不内置任何站点的表),可用 `trn gen-fontmap` 生成。
225    #[serde(default, rename = "fontMap", skip_serializing_if = "Option::is_none")]
226    pub font_map: Option<std::collections::BTreeMap<String, String>>,
227    /// 繁简转换。
228    #[serde(default, skip_serializing_if = "Option::is_none")]
229    pub cn: Option<CnConvert>,
230    /// JS 后处理(逃生舱;脚本里以当前串为 `result`)。需启用 `js` feature。
231    #[serde(default, skip_serializing_if = "Option::is_none")]
232    pub js: Option<String>,
233}
234
235/// 叶子规则:在当前上下文做一次抽取。
236#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
237#[serde(rename_all = "camelCase")]
238#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
239pub struct LeafRule {
240    #[serde(default)]
241    pub via: Via,
242    #[serde(default, skip_serializing_if = "Option::is_none")]
243    pub select: Option<String>,
244    #[serde(default, skip_serializing_if = "Option::is_none")]
245    pub index: Option<i64>,
246    #[serde(default)]
247    pub extract: Extract,
248    #[serde(default, skip_serializing_if = "Vec::is_empty")]
249    pub clean: Vec<CleanStep>,
250}
251
252/// 一条规则:叶子,或组合子。组合子按其唯一键判别(见 design D1)。
253///
254/// 反序列化时按变体顺序尝试:组合子(各有唯一必填键)在前,叶子兜底。
255#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
256#[serde(untagged)]
257#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
258pub enum Rule {
259    /// 取首个非空子规则结果(回退/自愈)。
260    FirstOf {
261        #[serde(rename = "firstOf")]
262        first_of: Vec<Rule>,
263    },
264    /// 拼接非空子规则结果。
265    Concat {
266        concat: Vec<Rule>,
267        #[serde(default)]
268        join: String,
269    },
270    /// 字面量。
271    Literal { literal: String },
272    /// 模板插值(`{{key}}`/`{{page}}`/命名变量)。
273    Template { template: String },
274    /// JS 逻辑编排逃生舱(值规则):以当前上下文为 `result`、注入 `baseUrl`/变量 + `crypto`
275    /// 助手求值,返回字符串。求值需启用 `js` feature(否则返回 `Unsupported("js")`)。
276    /// 必须置于 `Leaf` 之前——`js` 是其唯一判别键,否则会被全可选的 `Leaf` 吞掉。
277    Js { js: String },
278    /// 叶子(兜底)。
279    Leaf(LeafRule),
280}
281
282/// URL 字段:可为字符串模板,或一条规则。
283#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
284#[serde(untagged)]
285#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
286pub enum UrlOrRule {
287    Str(String),
288    Rule(Box<Rule>),
289}
290
// ───────────────────────── HTTP / requests ─────────────────────────

293/// 字符集。
294#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
295#[serde(rename_all = "kebab-case")]
296#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
297pub enum Charset {
298    #[default]
299    Auto,
300    Utf8,
301    Gbk,
302    Gb18030,
303    Big5,
304}
305
306/// 重试策略。
307#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
308#[serde(rename_all = "camelCase", deny_unknown_fields)]
309#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
310pub struct Retry {
311    #[serde(default)]
312    pub max: u32,
313    #[serde(default)]
314    pub backoff_ms: u64,
315}
316
317/// 速率限制。
318#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
319#[serde(rename_all = "camelCase", deny_unknown_fields)]
320#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
321pub struct RateLimit {
322    pub max_count: u64,
323    pub per_ms: u64,
324}
325
326/// 取页模式:是否动用浏览器解反爬挑战。
327/// 真正是否开浏览器还需 app/用户级授权(两级取交集,见 OpenSpec change `browser-fetcher` D12)。
328#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
329#[serde(rename_all = "lowercase")]
330#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
331pub enum FetchMode {
332    /// 默认:平时 reqwest,撞挑战才升级浏览器。
333    #[default]
334    Auto,
335    /// 永不开浏览器,撞挑战即降级。
336    Reqwest,
337    /// 整站强制走浏览器(首请求即被挑战 / 整页 JS 渲染)。
338    Browser,
339}
340
341/// HTTP 配置块。
342#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
343#[serde(rename_all = "camelCase", deny_unknown_fields)]
344#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
345pub struct Http {
346    #[serde(default)]
347    pub headers: HashMap<String, String>,
348    /// 静态 cookie;也是运行时注入 clearance cookie 的落点。
349    #[serde(default)]
350    pub cookies: HashMap<String, String>,
351    /// 先 GET 这些页以预热会话 cookie。
352    #[serde(default)]
353    pub warmup: Vec<String>,
354    #[serde(default)]
355    pub charset: Charset,
356    #[serde(default, skip_serializing_if = "Option::is_none")]
357    pub timeout: Option<u64>,
358    #[serde(default, skip_serializing_if = "Option::is_none")]
359    pub retry: Option<Retry>,
360    #[serde(default, skip_serializing_if = "Option::is_none")]
361    pub rate_limit: Option<RateLimit>,
362    /// 取页模式(auto|reqwest|browser);默认 auto。
363    #[serde(default)]
364    pub fetcher: FetchMode,
365}
366
367/// HTTP 方法。
368#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
369#[serde(rename_all = "UPPERCASE")]
370#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
371pub enum Method {
372    #[default]
373    Get,
374    Post,
375}
376
377/// 单个请求。
378#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
379#[serde(rename_all = "camelCase", deny_unknown_fields)]
380#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
381pub struct Request {
382    pub url: UrlOrRule,
383    #[serde(default)]
384    pub method: Method,
385    #[serde(default, skip_serializing_if = "Option::is_none")]
386    pub body: Option<UrlOrRule>,
387    #[serde(default)]
388    pub headers: HashMap<String, String>,
389    /// 命名捕获,供 template 使用。
390    #[serde(default)]
391    pub vars: HashMap<String, Rule>,
392}
393
// ───────────────────────── Operation rules ─────────────────────────

396/// 一本书的字段抽取规则(均可省略)。
397#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
398#[serde(rename_all = "camelCase", deny_unknown_fields)]
399#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
400pub struct BookRules {
401    /// 列表项:指向书详情页的链接(搜索/浏览结果用;bookInfo 阶段忽略)。
402    #[serde(default, skip_serializing_if = "Option::is_none")]
403    pub book_url: Option<Rule>,
404    #[serde(default, skip_serializing_if = "Option::is_none")]
405    pub name: Option<Rule>,
406    #[serde(default, skip_serializing_if = "Option::is_none")]
407    pub author: Option<Rule>,
408    #[serde(default, skip_serializing_if = "Option::is_none")]
409    pub cover: Option<Rule>,
410    #[serde(default, skip_serializing_if = "Option::is_none")]
411    pub intro: Option<Rule>,
412    #[serde(default, skip_serializing_if = "Option::is_none")]
413    pub kind: Option<Rule>,
414    #[serde(default, skip_serializing_if = "Option::is_none")]
415    pub last_chapter: Option<Rule>,
416    #[serde(default, skip_serializing_if = "Option::is_none")]
417    pub toc_url: Option<Rule>,
418    #[serde(default, skip_serializing_if = "Option::is_none")]
419    pub word_count: Option<Rule>,
420}
421
422/// 搜索操作。
423#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
424#[serde(deny_unknown_fields)]
425#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
426pub struct SearchOp {
427    pub request: Request,
428    pub list: Rule,
429    pub item: BookRules,
430}
431
432/// 浏览分类。
433#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
434#[serde(deny_unknown_fields)]
435#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
436pub struct Category {
437    pub title: String,
438    pub url: UrlOrRule,
439}
440
441/// 浏览操作。
442#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
443#[serde(deny_unknown_fields)]
444#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
445pub struct ExploreOp {
446    pub categories: Vec<Category>,
447    pub list: Rule,
448    pub item: BookRules,
449}
450
/// Serde default for the `max_pages` fields: 100 pages when the key is omitted.
fn default_max_pages() -> u32 {
    100
}

455/// 目录规则(章节 + 分卷 + 可选分页)。
456#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
457#[serde(rename_all = "camelCase", deny_unknown_fields)]
458#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
459pub struct TocRules {
460    pub list: Rule,
461    pub name: Rule,
462    pub url: Rule,
463    #[serde(default, skip_serializing_if = "Option::is_none")]
464    pub is_volume: Option<Rule>,
465    #[serde(default, skip_serializing_if = "Option::is_none")]
466    pub next_page: Option<Rule>,
467    #[serde(default = "default_max_pages")]
468    pub max_pages: u32,
469}
470
471/// 正文规则(可选分页)。
472#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
473#[serde(rename_all = "camelCase", deny_unknown_fields)]
474#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
475pub struct ContentRules {
476    pub value: Rule,
477    #[serde(default, skip_serializing_if = "Option::is_none")]
478    pub next_page: Option<Rule>,
479    #[serde(default = "default_max_pages")]
480    pub max_pages: u32,
481}
482
// ───────────────────────── Samples ─────────────────────────

485/// 样例期望不变量。
486#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
487#[serde(rename_all = "camelCase")]
488#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
489pub struct Expect {
490    #[serde(default, skip_serializing_if = "Option::is_none")]
491    pub name: Option<String>,
492    #[serde(default, skip_serializing_if = "Option::is_none")]
493    pub min_chapters: Option<usize>,
494    #[serde(default, skip_serializing_if = "Option::is_none")]
495    pub volumes: Option<usize>,
496    #[serde(default, skip_serializing_if = "Option::is_none")]
497    pub min_content_chars: Option<usize>,
498}
499
500/// 黄金样例。
501#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
502#[serde(rename_all = "camelCase", deny_unknown_fields)]
503#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
504pub struct Sample {
505    pub book_url: String,
506    #[serde(default)]
507    pub expect: Expect,
508}
509
// ───────────────────────── Top-level book source ─────────────────────────

512/// v2 书源。
513#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
514#[serde(rename_all = "camelCase", deny_unknown_fields)]
515#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
516pub struct BookSource {
517    /// 固定为 `"trnovel-booksource/v2"`。
518    pub schema: String,
519    pub name: String,
520    #[serde(default, skip_serializing_if = "String::is_empty")]
521    pub group: String,
522    /// 站点基址,用于相对链接解析与 `{{base}}`。
523    pub url: String,
524    #[serde(default)]
525    pub http: Http,
526    #[serde(default, skip_serializing_if = "Option::is_none")]
527    pub search: Option<SearchOp>,
528    #[serde(default, skip_serializing_if = "Option::is_none")]
529    pub explore: Option<ExploreOp>,
530    pub book_info: BookRules,
531    pub toc: TocRules,
532    pub content: ContentRules,
533    #[serde(default, skip_serializing_if = "Vec::is_empty")]
534    pub samples: Vec<Sample>,
535}
536
/// The expected schema identifier.
pub const SCHEMA_ID: &str = "trnovel-booksource/v2";

540impl BookSource {
541    /// 从 JSON 字符串解析一个书源。
542    pub fn from_json(s: &str) -> Result<Self, ConfigError> {
543        Ok(serde_json::from_str(s)?)
544    }
545
546    /// 从 JSON 值解析一个或多个书源(支持单对象或数组)。
547    pub fn from_value_many(value: serde_json::Value) -> Result<Vec<Self>, ConfigError> {
548        if value.is_array() {
549            Ok(serde_json::from_value(value)?)
550        } else {
551            Ok(vec![serde_json::from_value(value)?])
552        }
553    }
554
555    /// 从本地文件导入(支持单对象或数组)。
556    pub fn from_path(path: &str) -> Result<Vec<Self>, super::error::BookSourceError> {
557        let text = std::fs::read_to_string(path).map_err(ConfigError::Io)?;
558        let value = serde_json::from_str(&text).map_err(ConfigError::Json)?;
559        Ok(Self::from_value_many(value)?)
560    }
561
562    /// 从网络 URL 导入(支持单对象或数组)。
563    pub async fn from_url(url: &str) -> Result<Vec<Self>, super::error::BookSourceError> {
564        use super::error::FetchError;
565        let text = reqwest::get(url)
566            .await
567            .map_err(FetchError::Http)?
568            // 先判 HTTP 状态:4xx/5xx 返回错误页时,避免把"非 JSON"误报成 JSON 解析失败。
569            .error_for_status()
570            .map_err(FetchError::Http)?
571            .text()
572            .await
573            .map_err(FetchError::Http)?;
574        let value = serde_json::from_str(&text).map_err(ConfigError::Json)?;
575        Ok(Self::from_value_many(value)?)
576    }
577}
578
#[cfg(test)]
mod tests {
    use super::*;

    /// A representative book source structurally identical to examples/bilixs.v2.json
    /// (covers leaf / firstOf / concat / template / attr / http+cookies / samples /
    /// volume detection via isVolume).
    const BILIXS_V2: &str = r#"{
      "schema": "trnovel-booksource/v2",
      "name": "哔哩小说",
      "group": "测试",
      "url": "https://www.bilixs.com",
      "http": {
        "headers": { "User-Agent": "Mozilla/5.0" },
        "cookies": {},
        "warmup": ["https://www.bilixs.com/"],
        "charset": "auto",
        "timeout": 15000,
        "retry": { "max": 2, "backoffMs": 500 }
      },
      "search": {
        "request": { "url": { "template": "{{base}}/search.html?searchkey={{key}}" }, "method": "GET" },
        "list": { "via": "css", "select": ".module-item" },
        "item": {
          "name": { "via": "css", "select": ".module-item-title", "extract": "text" },
          "tocUrl": { "via": "css", "select": ".module-item-title", "extract": { "attr": "href" } }
        }
      },
      "explore": {
        "categories": [ { "title": "最近更新", "url": { "template": "{{base}}/book/lastupdate_0_1_0_0_0_0_0_{{page}}_0.html" } } ],
        "list": { "via": "css", "select": ".module-item" },
        "item": { "name": { "via": "css", "select": ".module-item-title", "extract": "text" } }
      },
      "bookInfo": {
        "name": { "via": "css", "select": "[property=\"og:novel:book_name\"]", "extract": { "attr": "content" } },
        "cover": { "via": "css", "select": "[property=\"og:image\"]", "extract": { "attr": "content" } },
        "kind": { "concat": [
            { "via": "css", "select": "[property=\"og:novel:tags\"]", "extract": { "attr": "content" } },
            { "via": "css", "select": "[property=\"og:novel:status\"]", "extract": { "attr": "content" } }
          ], "join": " · " },
        "tocUrl": { "via": "css", "select": "[property=\"og:novel:read_url\"]", "extract": { "attr": "content" } }
      },
      "toc": {
        "list": { "via": "css", "select": ".box > h2.module-title.type, .box a.module-row-text" },
        "name": { "firstOf": [
            { "via": "css", "select": ".module-row-title", "extract": "text" },
            { "via": "css", "select": "h2", "extract": "text" }
          ] },
        "url": { "via": "css", "select": "a", "extract": { "attr": "href" } },
        "isVolume": { "via": "css", "select": "h2", "extract": "text" },
        "maxPages": 1
      },
      "content": {
        "value": { "via": "css", "select": ".article-content", "extract": "html",
          "clean": [ { "regex": "请收藏本站[^<\\n]*", "replace": "" }, { "trim": true } ] }
      },
      "samples": [
        { "bookUrl": "/novel/guzhenren.html", "expect": { "name": "蛊真人", "volumes": 8, "minChapters": 2000 } }
      ]
    }"#;

    /// The fixture parses and exposes the expected schema id and name.
    #[test]
    fn parses_v2_book_source() {
        let bs = BookSource::from_json(BILIXS_V2).expect("应解析 v2 书源");
        assert_eq!(bs.schema, SCHEMA_ID);
        assert_eq!(bs.name, "哔哩小说");
    }

    /// `toc.name` deserializes as the `firstOf` combinator with two children.
    #[test]
    fn toc_name_is_firstof_with_two_leaves() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        match &bs.toc.name {
            Rule::FirstOf { first_of } => assert_eq!(first_of.len(), 2),
            other => panic!("toc.name 应为 firstOf,实际 {other:?}"),
        }
    }

    /// `toc.isVolume` deserializes as a CSS leaf selecting `h2`.
    #[test]
    fn toc_is_volume_is_leaf_css_h2() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let iv = bs.toc.is_volume.as_ref().expect("isVolume 应存在");
        match iv {
            Rule::Leaf(l) => {
                assert_eq!(l.via, Via::Css);
                assert_eq!(l.select.as_deref(), Some("h2"));
            }
            other => panic!("isVolume 应为叶子,实际 {other:?}"),
        }
    }

    /// An object-valued `url` deserializes as a template rule rather than a plain string.
    #[test]
    fn search_url_is_template_rule() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let req = &bs.search.as_ref().unwrap().request;
        match &req.url {
            UrlOrRule::Rule(r) => assert!(matches!(**r, Rule::Template { .. })),
            other => panic!("search.request.url 应为模板规则,实际 {other:?}"),
        }
    }

    /// `{ "attr": "content" }` deserializes as `Extract::Attr`.
    #[test]
    fn book_info_cover_extracts_attr() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        match bs.book_info.cover.as_ref().unwrap() {
            Rule::Leaf(l) => assert_eq!(
                l.extract,
                Extract::Attr {
                    attr: "content".into()
                }
            ),
            other => panic!("cover 应为属性抽取叶子,实际 {other:?}"),
        }
    }

    /// The `http` block's headers, cookies, warmup, charset and retry are all parsed.
    #[test]
    fn http_cookies_and_warmup_parsed() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        // The fixture declares an empty cookie map and a single header.
        assert!(bs.http.cookies.is_empty());
        assert_eq!(
            bs.http.headers.get("User-Agent").map(String::as_str),
            Some("Mozilla/5.0")
        );
        assert_eq!(bs.http.warmup, vec!["https://www.bilixs.com/"]);
        assert_eq!(bs.http.charset, Charset::Auto);
        assert_eq!(bs.http.retry.as_ref().unwrap().backoff_ms, 500);
    }

    /// Sample expectations use camelCase keys (`minChapters`) and parse into `Expect`.
    #[test]
    fn sample_expectations_parsed() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let s = &bs.samples[0];
        assert_eq!(s.expect.volumes, Some(8));
        assert_eq!(s.expect.min_chapters, Some(2000));
    }

    /// Serializing and re-parsing yields an equal value.
    #[test]
    fn round_trips_through_json() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let json = serde_json::to_string(&bs).unwrap();
        let bs2 = BookSource::from_json(&json).unwrap();
        assert_eq!(bs, bs2);
    }

    /// An unknown top-level key is rejected by `deny_unknown_fields`.
    #[test]
    fn rejects_unknown_top_level_field() {
        // Inject an extra, misspelled key while keeping the required `name` intact, so the
        // only possible cause of failure is `deny_unknown_fields`. Merely renaming `name`
        // would also fail with "missing field `name`" and would prove nothing.
        let bad = BILIXS_V2.replacen("\"name\":", "\"nmae\": 0, \"name\":", 1);
        assert_ne!(bad, BILIXS_V2, "fixture must contain a top-level name key");
        assert!(
            BookSource::from_json(&bad).is_err(),
            "拼错字段应被 deny_unknown_fields 拒绝"
        );
    }
}

/// Drift guard: `book-source.schema.json` must equal the schema generated from the types
/// right now (`--features schema`). A failure means the configuration types were changed
/// without regenerating the schema — rerun gen_schema as the message instructs.
// NOTE(review): schemars also emits `///` doc comments as schema descriptions, so
// comment-only edits to the configuration types can presumably trip this test as well —
// regenerate the schema after such edits.
#[cfg(all(test, feature = "schema"))]
mod schema_sync {
    /// Compares the freshly generated schema with the committed file (both trimmed).
    #[test]
    fn schema_is_in_sync() {
        let generated =
            serde_json::to_string_pretty(&schemars::schema_for!(crate::BookSource)).unwrap();
        let committed = include_str!("../book-source.schema.json");
        assert_eq!(
            generated.trim(),
            committed.trim(),
            "book-source.schema.json 与配置类型不同步;请重新生成:\n  \
             cargo run -p parse-book-source --features schema --example gen_schema \
             > crates/parse-book-source/book-source.schema.json"
        );
    }
}