Skip to main content

parse_book_source/
source.rs

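//! v2 book-source configuration types (pure serde, mirroring `book-source.schema.json`).
//!
//! Rules are explicit structured objects; there is no compact string DSL. `Rule` is both
//! the configuration and the syntax tree walked by the evaluator (see design D1/D6).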
5
6use super::error::ConfigError;
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
// ───────────────────────── Rule AST ─────────────────────────
11
12/// 抽取后端(决定 `select` 的语义)。
13#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
14#[serde(rename_all = "lowercase")]
15#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
16pub enum Via {
17    #[default]
18    Css,
19    Xpath,
20    Json,
21    Regex,
22    /// 直接使用当前上下文值(只跑 clean)。
23    Raw,
24}
25
26/// 取值方式(枚举字符串 或 `{ "attr": "..." }`)。
27#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
28#[serde(untagged)]
29#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
30pub enum Extract {
31    Op(ExtractOp),
32    Attr { attr: String },
33}
34
35impl Default for Extract {
36    fn default() -> Self {
37        Extract::Op(ExtractOp::Text)
38    }
39}
40
41/// 文本/HTML 取值算子。
42#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
43#[serde(rename_all = "camelCase")]
44#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
45pub enum ExtractOp {
46    #[default]
47    Text,
48    OwnText,
49    Html,
50    InnerHtml,
51    OuterHtml,
52}
53
54/// 编解码方式(`decode`/`encode` 算子,以及 crypto 的字节↔串编码)。
55#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
56#[serde(rename_all = "camelCase")]
57#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
58pub enum Codec {
59    Base64,
60    Base64url,
61    Hex,
62    /// URL 百分号编解码。
63    Url,
64}
65
66/// crypto 的 key/iv/输入/输出字节编码。
67#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
68#[serde(rename_all = "lowercase")]
69#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
70pub enum ByteEnc {
71    #[default]
72    Utf8,
73    Base64,
74    Hex,
75    /// 原样字节(等同 utf8 字节,主要用于输入密文已是裸字节串的场景)。
76    Raw,
77}
78
79/// 哈希算法。
80#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
81#[serde(rename_all = "lowercase")]
82#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
83pub enum HashAlgo {
84    Md5,
85    Sha1,
86    Sha256,
87    Sha512,
88}
89
90/// 哈希/HMAC 输出编码。
91#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
92#[serde(rename_all = "lowercase")]
93#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
94pub enum HashOut {
95    #[default]
96    Hex,
97    Base64,
98}
99
100/// 哈希算子(可选 HMAC)。
101#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
102#[serde(rename_all = "camelCase", deny_unknown_fields)]
103#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
104pub struct HashStep {
105    pub algo: HashAlgo,
106    #[serde(default)]
107    pub output: HashOut,
108    /// 提供则计算 HMAC(以此为密钥)。
109    #[serde(default, skip_serializing_if = "Option::is_none")]
110    pub hmac_key: Option<String>,
111    #[serde(default)]
112    pub hmac_key_enc: ByteEnc,
113}
114
115/// 对称加密算法。
116#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
117#[serde(rename_all = "camelCase")]
118#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
119pub enum CipherAlgo {
120    Aes,
121    Des,
122    TripleDes,
123}
124
125/// 加密模式。
126#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
127#[serde(rename_all = "lowercase")]
128#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
129pub enum CipherMode {
130    Cbc,
131    Ecb,
132    Cfb,
133    Gcm,
134}
135
136/// 填充方式(gcm 忽略)。
137#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
138#[serde(rename_all = "lowercase")]
139#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
140pub enum Padding {
141    #[default]
142    Pkcs7,
143    Zero,
144    None,
145}
146
147/// 加解密方向。
148#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
149#[serde(rename_all = "lowercase")]
150#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
151pub enum CipherOp {
152    #[default]
153    Decrypt,
154    Encrypt,
155}
156
157/// 加解密算子。默认值贴合「解密正文」主场景:`op=decrypt`、`inputEnc=base64`、`outputEnc=utf8`。
158#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
159#[serde(rename_all = "camelCase", deny_unknown_fields)]
160#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
161pub struct CipherStep {
162    pub algo: CipherAlgo,
163    pub mode: CipherMode,
164    #[serde(default)]
165    pub padding: Padding,
166    #[serde(default)]
167    pub op: CipherOp,
168    pub key: String,
169    #[serde(default)]
170    pub key_enc: ByteEnc,
171    #[serde(default, skip_serializing_if = "Option::is_none")]
172    pub iv: Option<String>,
173    #[serde(default)]
174    pub iv_enc: ByteEnc,
175    /// 入参密文串→字节;省略时按 `op` 取默认(decrypt→base64,encrypt→utf8)。
176    #[serde(default, skip_serializing_if = "Option::is_none")]
177    pub input_enc: Option<ByteEnc>,
178    /// 结果字节→串;省略时按 `op` 取默认(decrypt→utf8,encrypt→base64)。
179    #[serde(default, skip_serializing_if = "Option::is_none")]
180    pub output_enc: Option<ByteEnc>,
181}
182
183/// 繁简转换方向。
184#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
185#[serde(rename_all = "lowercase")]
186#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
187pub enum CnConvert {
188    /// 繁体 → 简体。
189    T2s,
190    /// 简体 → 繁体。
191    S2t,
192}
193
194/// 单步后处理。步内多算子按固定顺序执行:
195/// `regex/replace → trim → prepend → append → decode → encode → hash → cipher → cn`。
196#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
197#[serde(deny_unknown_fields)]
198#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
199pub struct CleanStep {
200    #[serde(default, skip_serializing_if = "Option::is_none")]
201    pub regex: Option<String>,
202    #[serde(default, skip_serializing_if = "Option::is_none")]
203    pub replace: Option<String>,
204    #[serde(default, skip_serializing_if = "Option::is_none")]
205    pub trim: Option<bool>,
206    #[serde(default, skip_serializing_if = "Option::is_none")]
207    pub prepend: Option<String>,
208    #[serde(default, skip_serializing_if = "Option::is_none")]
209    pub append: Option<String>,
210    /// 解码(base64/base64url/hex/url)。
211    #[serde(default, skip_serializing_if = "Option::is_none")]
212    pub decode: Option<Codec>,
213    /// 编码(base64/base64url/hex/url)。
214    #[serde(default, skip_serializing_if = "Option::is_none")]
215    pub encode: Option<Codec>,
216    /// 哈希/HMAC。
217    #[serde(default, skip_serializing_if = "Option::is_none")]
218    pub hash: Option<HashStep>,
219    /// 对称加解密。
220    #[serde(default, skip_serializing_if = "Option::is_none")]
221    pub cipher: Option<CipherStep>,
222    /// 繁简转换。
223    #[serde(default, skip_serializing_if = "Option::is_none")]
224    pub cn: Option<CnConvert>,
225    /// JS 后处理(逃生舱;脚本里以当前串为 `result`)。需启用 `js` feature。
226    #[serde(default, skip_serializing_if = "Option::is_none")]
227    pub js: Option<String>,
228}
229
230/// 叶子规则:在当前上下文做一次抽取。
231#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
232#[serde(rename_all = "camelCase")]
233#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
234pub struct LeafRule {
235    #[serde(default)]
236    pub via: Via,
237    #[serde(default, skip_serializing_if = "Option::is_none")]
238    pub select: Option<String>,
239    #[serde(default, skip_serializing_if = "Option::is_none")]
240    pub index: Option<i64>,
241    #[serde(default)]
242    pub extract: Extract,
243    #[serde(default, skip_serializing_if = "Vec::is_empty")]
244    pub clean: Vec<CleanStep>,
245}
246
247/// 一条规则:叶子,或组合子。组合子按其唯一键判别(见 design D1)。
248///
249/// 反序列化时按变体顺序尝试:组合子(各有唯一必填键)在前,叶子兜底。
250#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
251#[serde(untagged)]
252#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
253pub enum Rule {
254    /// 取首个非空子规则结果(回退/自愈)。
255    FirstOf {
256        #[serde(rename = "firstOf")]
257        first_of: Vec<Rule>,
258    },
259    /// 拼接非空子规则结果。
260    Concat {
261        concat: Vec<Rule>,
262        #[serde(default)]
263        join: String,
264    },
265    /// 字面量。
266    Literal { literal: String },
267    /// 模板插值(`{{key}}`/`{{page}}`/命名变量)。
268    Template { template: String },
269    /// JS 逻辑编排逃生舱(值规则):以当前上下文为 `result`、注入 `baseUrl`/变量 + `crypto`
270    /// 助手求值,返回字符串。求值需启用 `js` feature(否则返回 `Unsupported("js")`)。
271    /// 必须置于 `Leaf` 之前——`js` 是其唯一判别键,否则会被全可选的 `Leaf` 吞掉。
272    Js { js: String },
273    /// 叶子(兜底)。
274    Leaf(LeafRule),
275}
276
277/// URL 字段:可为字符串模板,或一条规则。
278#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
279#[serde(untagged)]
280#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
281pub enum UrlOrRule {
282    Str(String),
283    Rule(Box<Rule>),
284}
285
// ───────────────────────── HTTP / requests ─────────────────────────
287
288/// 字符集。
289#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
290#[serde(rename_all = "kebab-case")]
291#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
292pub enum Charset {
293    #[default]
294    Auto,
295    Utf8,
296    Gbk,
297    Gb18030,
298    Big5,
299}
300
301/// 重试策略。
302#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
303#[serde(rename_all = "camelCase", deny_unknown_fields)]
304#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
305pub struct Retry {
306    #[serde(default)]
307    pub max: u32,
308    #[serde(default)]
309    pub backoff_ms: u64,
310}
311
312/// 速率限制。
313#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
314#[serde(rename_all = "camelCase", deny_unknown_fields)]
315#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
316pub struct RateLimit {
317    pub max_count: u64,
318    pub per_ms: u64,
319}
320
321/// 取页模式:是否动用浏览器解反爬挑战。
322/// 真正是否开浏览器还需 app/用户级授权(两级取交集,见 OpenSpec change `browser-fetcher` D12)。
323#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
324#[serde(rename_all = "lowercase")]
325#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
326pub enum FetchMode {
327    /// 默认:平时 reqwest,撞挑战才升级浏览器。
328    #[default]
329    Auto,
330    /// 永不开浏览器,撞挑战即降级。
331    Reqwest,
332    /// 整站强制走浏览器(首请求即被挑战 / 整页 JS 渲染)。
333    Browser,
334}
335
336/// HTTP 配置块。
337#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
338#[serde(rename_all = "camelCase", deny_unknown_fields)]
339#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
340pub struct Http {
341    #[serde(default)]
342    pub headers: HashMap<String, String>,
343    /// 静态 cookie;也是运行时注入 clearance cookie 的落点。
344    #[serde(default)]
345    pub cookies: HashMap<String, String>,
346    /// 先 GET 这些页以预热会话 cookie。
347    #[serde(default)]
348    pub warmup: Vec<String>,
349    #[serde(default)]
350    pub charset: Charset,
351    #[serde(default, skip_serializing_if = "Option::is_none")]
352    pub timeout: Option<u64>,
353    #[serde(default, skip_serializing_if = "Option::is_none")]
354    pub retry: Option<Retry>,
355    #[serde(default, skip_serializing_if = "Option::is_none")]
356    pub rate_limit: Option<RateLimit>,
357    /// 取页模式(auto|reqwest|browser);默认 auto。
358    #[serde(default)]
359    pub fetcher: FetchMode,
360}
361
362/// HTTP 方法。
363#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
364#[serde(rename_all = "UPPERCASE")]
365#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
366pub enum Method {
367    #[default]
368    Get,
369    Post,
370}
371
372/// 单个请求。
373#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
374#[serde(rename_all = "camelCase", deny_unknown_fields)]
375#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
376pub struct Request {
377    pub url: UrlOrRule,
378    #[serde(default)]
379    pub method: Method,
380    #[serde(default, skip_serializing_if = "Option::is_none")]
381    pub body: Option<UrlOrRule>,
382    #[serde(default)]
383    pub headers: HashMap<String, String>,
384    /// 命名捕获,供 template 使用。
385    #[serde(default)]
386    pub vars: HashMap<String, Rule>,
387}
388
// ───────────────────────── Operation rules ─────────────────────────
390
391/// 一本书的字段抽取规则(均可省略)。
392#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
393#[serde(rename_all = "camelCase", deny_unknown_fields)]
394#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
395pub struct BookRules {
396    /// 列表项:指向书详情页的链接(搜索/浏览结果用;bookInfo 阶段忽略)。
397    #[serde(default, skip_serializing_if = "Option::is_none")]
398    pub book_url: Option<Rule>,
399    #[serde(default, skip_serializing_if = "Option::is_none")]
400    pub name: Option<Rule>,
401    #[serde(default, skip_serializing_if = "Option::is_none")]
402    pub author: Option<Rule>,
403    #[serde(default, skip_serializing_if = "Option::is_none")]
404    pub cover: Option<Rule>,
405    #[serde(default, skip_serializing_if = "Option::is_none")]
406    pub intro: Option<Rule>,
407    #[serde(default, skip_serializing_if = "Option::is_none")]
408    pub kind: Option<Rule>,
409    #[serde(default, skip_serializing_if = "Option::is_none")]
410    pub last_chapter: Option<Rule>,
411    #[serde(default, skip_serializing_if = "Option::is_none")]
412    pub toc_url: Option<Rule>,
413    #[serde(default, skip_serializing_if = "Option::is_none")]
414    pub word_count: Option<Rule>,
415}
416
417/// 搜索操作。
418#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
419#[serde(deny_unknown_fields)]
420#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
421pub struct SearchOp {
422    pub request: Request,
423    pub list: Rule,
424    pub item: BookRules,
425}
426
427/// 浏览分类。
428#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
429#[serde(deny_unknown_fields)]
430#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
431pub struct Category {
432    pub title: String,
433    pub url: UrlOrRule,
434}
435
436/// 浏览操作。
437#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
438#[serde(deny_unknown_fields)]
439#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
440pub struct ExploreOp {
441    pub categories: Vec<Category>,
442    pub list: Rule,
443    pub item: BookRules,
444}
445
// serde default for the `max_pages` fields of `TocRules` and `ContentRules`.
fn default_max_pages() -> u32 {
    const DEFAULT_MAX_PAGES: u32 = 100;
    DEFAULT_MAX_PAGES
}
449
450/// 目录规则(章节 + 分卷 + 可选分页)。
451#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
452#[serde(rename_all = "camelCase", deny_unknown_fields)]
453#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
454pub struct TocRules {
455    pub list: Rule,
456    pub name: Rule,
457    pub url: Rule,
458    #[serde(default, skip_serializing_if = "Option::is_none")]
459    pub is_volume: Option<Rule>,
460    #[serde(default, skip_serializing_if = "Option::is_none")]
461    pub next_page: Option<Rule>,
462    #[serde(default = "default_max_pages")]
463    pub max_pages: u32,
464}
465
466/// 正文规则(可选分页)。
467#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
468#[serde(rename_all = "camelCase", deny_unknown_fields)]
469#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
470pub struct ContentRules {
471    pub value: Rule,
472    #[serde(default, skip_serializing_if = "Option::is_none")]
473    pub next_page: Option<Rule>,
474    #[serde(default = "default_max_pages")]
475    pub max_pages: u32,
476}
477
// ───────────────────────── Samples ─────────────────────────
479
480/// 样例期望不变量。
481#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
482#[serde(rename_all = "camelCase")]
483#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
484pub struct Expect {
485    #[serde(default, skip_serializing_if = "Option::is_none")]
486    pub name: Option<String>,
487    #[serde(default, skip_serializing_if = "Option::is_none")]
488    pub min_chapters: Option<usize>,
489    #[serde(default, skip_serializing_if = "Option::is_none")]
490    pub volumes: Option<usize>,
491    #[serde(default, skip_serializing_if = "Option::is_none")]
492    pub min_content_chars: Option<usize>,
493}
494
495/// 黄金样例。
496#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
497#[serde(rename_all = "camelCase", deny_unknown_fields)]
498#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
499pub struct Sample {
500    pub book_url: String,
501    #[serde(default)]
502    pub expect: Expect,
503}
504
// ───────────────────────── Top-level book source ─────────────────────────
506
507/// v2 书源。
508#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
509#[serde(rename_all = "camelCase", deny_unknown_fields)]
510#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
511pub struct BookSource {
512    /// 固定为 `"trnovel-booksource/v2"`。
513    pub schema: String,
514    pub name: String,
515    #[serde(default, skip_serializing_if = "String::is_empty")]
516    pub group: String,
517    /// 站点基址,用于相对链接解析与 `{{base}}`。
518    pub url: String,
519    #[serde(default)]
520    pub http: Http,
521    #[serde(default, skip_serializing_if = "Option::is_none")]
522    pub search: Option<SearchOp>,
523    #[serde(default, skip_serializing_if = "Option::is_none")]
524    pub explore: Option<ExploreOp>,
525    pub book_info: BookRules,
526    pub toc: TocRules,
527    pub content: ContentRules,
528    #[serde(default, skip_serializing_if = "Vec::is_empty")]
529    pub samples: Vec<Sample>,
530}
531
/// The schema identifier a v2 book source is expected to carry in its `schema` field.
pub const SCHEMA_ID: &str = "trnovel-booksource/v2";
534
535impl BookSource {
536    /// 从 JSON 字符串解析一个书源。
537    pub fn from_json(s: &str) -> Result<Self, ConfigError> {
538        Ok(serde_json::from_str(s)?)
539    }
540
541    /// 从 JSON 值解析一个或多个书源(支持单对象或数组)。
542    pub fn from_value_many(value: serde_json::Value) -> Result<Vec<Self>, ConfigError> {
543        if value.is_array() {
544            Ok(serde_json::from_value(value)?)
545        } else {
546            Ok(vec![serde_json::from_value(value)?])
547        }
548    }
549
550    /// 从本地文件导入(支持单对象或数组)。
551    pub fn from_path(path: &str) -> Result<Vec<Self>, super::error::BookSourceError> {
552        let text = std::fs::read_to_string(path).map_err(ConfigError::Io)?;
553        let value = serde_json::from_str(&text).map_err(ConfigError::Json)?;
554        Ok(Self::from_value_many(value)?)
555    }
556
557    /// 从网络 URL 导入(支持单对象或数组)。
558    pub async fn from_url(url: &str) -> Result<Vec<Self>, super::error::BookSourceError> {
559        use super::error::FetchError;
560        let text = reqwest::get(url)
561            .await
562            .map_err(FetchError::Http)?
563            // 先判 HTTP 状态:4xx/5xx 返回错误页时,避免把"非 JSON"误报成 JSON 解析失败。
564            .error_for_status()
565            .map_err(FetchError::Http)?
566            .text()
567            .await
568            .map_err(FetchError::Http)?;
569        let value = serde_json::from_str(&text).map_err(ConfigError::Json)?;
570        Ok(Self::from_value_many(value)?)
571    }
572}
573
#[cfg(test)]
mod tests {
    use super::*;

    /// A representative book source with the same structure as
    /// examples/bilixs.v2.json (covers leaf / firstOf / concat / template / attr /
    /// http+cookies / samples / the `isVolume` volume marker).
    const BILIXS_V2: &str = r#"{
      "schema": "trnovel-booksource/v2",
      "name": "哔哩小说",
      "group": "测试",
      "url": "https://www.bilixs.com",
      "http": {
        "headers": { "User-Agent": "Mozilla/5.0" },
        "cookies": {},
        "warmup": ["https://www.bilixs.com/"],
        "charset": "auto",
        "timeout": 15000,
        "retry": { "max": 2, "backoffMs": 500 }
      },
      "search": {
        "request": { "url": { "template": "{{base}}/search.html?searchkey={{key}}" }, "method": "GET" },
        "list": { "via": "css", "select": ".module-item" },
        "item": {
          "name": { "via": "css", "select": ".module-item-title", "extract": "text" },
          "tocUrl": { "via": "css", "select": ".module-item-title", "extract": { "attr": "href" } }
        }
      },
      "explore": {
        "categories": [ { "title": "最近更新", "url": { "template": "{{base}}/book/lastupdate_0_1_0_0_0_0_0_{{page}}_0.html" } } ],
        "list": { "via": "css", "select": ".module-item" },
        "item": { "name": { "via": "css", "select": ".module-item-title", "extract": "text" } }
      },
      "bookInfo": {
        "name": { "via": "css", "select": "[property=\"og:novel:book_name\"]", "extract": { "attr": "content" } },
        "cover": { "via": "css", "select": "[property=\"og:image\"]", "extract": { "attr": "content" } },
        "kind": { "concat": [
            { "via": "css", "select": "[property=\"og:novel:tags\"]", "extract": { "attr": "content" } },
            { "via": "css", "select": "[property=\"og:novel:status\"]", "extract": { "attr": "content" } }
          ], "join": " · " },
        "tocUrl": { "via": "css", "select": "[property=\"og:novel:read_url\"]", "extract": { "attr": "content" } }
      },
      "toc": {
        "list": { "via": "css", "select": ".box > h2.module-title.type, .box a.module-row-text" },
        "name": { "firstOf": [
            { "via": "css", "select": ".module-row-title", "extract": "text" },
            { "via": "css", "select": "h2", "extract": "text" }
          ] },
        "url": { "via": "css", "select": "a", "extract": { "attr": "href" } },
        "isVolume": { "via": "css", "select": "h2", "extract": "text" },
        "maxPages": 1
      },
      "content": {
        "value": { "via": "css", "select": ".article-content", "extract": "html",
          "clean": [ { "regex": "请收藏本站[^<\\n]*", "replace": "" }, { "trim": true } ] }
      },
      "samples": [
        { "bookUrl": "/novel/guzhenren.html", "expect": { "name": "蛊真人", "volumes": 8, "minChapters": 2000 } }
      ]
    }"#;

    #[test]
    fn parses_v2_book_source() {
        let bs = BookSource::from_json(BILIXS_V2).expect("应解析 v2 书源");
        assert_eq!(bs.schema, SCHEMA_ID);
        assert_eq!(bs.name, "哔哩小说");
    }

    #[test]
    fn toc_name_is_firstof_with_two_leaves() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        match &bs.toc.name {
            Rule::FirstOf { first_of } => assert_eq!(first_of.len(), 2),
            other => panic!("toc.name 应为 firstOf,实际 {other:?}"),
        }
    }

    #[test]
    fn toc_is_volume_is_leaf_css_h2() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let iv = bs.toc.is_volume.as_ref().expect("isVolume 应存在");
        match iv {
            Rule::Leaf(l) => {
                assert_eq!(l.via, Via::Css);
                assert_eq!(l.select.as_deref(), Some("h2"));
            }
            other => panic!("isVolume 应为叶子,实际 {other:?}"),
        }
    }

    #[test]
    fn search_url_is_template_rule() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let req = &bs.search.as_ref().unwrap().request;
        match &req.url {
            UrlOrRule::Rule(r) => assert!(matches!(**r, Rule::Template { .. })),
            other => panic!("search.request.url 应为模板规则,实际 {other:?}"),
        }
    }

    #[test]
    fn book_info_cover_extracts_attr() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        match bs.book_info.cover.as_ref().unwrap() {
            Rule::Leaf(l) => assert_eq!(
                l.extract,
                Extract::Attr {
                    attr: "content".into()
                }
            ),
            other => panic!("cover 应为属性抽取叶子,实际 {other:?}"),
        }
    }

    #[test]
    fn http_cookies_and_warmup_parsed() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        assert_eq!(bs.http.warmup, vec!["https://www.bilixs.com/"]);
        assert_eq!(bs.http.charset, Charset::Auto);
        assert_eq!(bs.http.retry.as_ref().unwrap().backoff_ms, 500);
    }

    #[test]
    fn sample_expectations_parsed() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let s = &bs.samples[0];
        assert_eq!(s.expect.volumes, Some(8));
        assert_eq!(s.expect.min_chapters, Some(2000));
    }

    #[test]
    fn round_trips_through_json() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let json = serde_json::to_string(&bs).unwrap();
        let bs2 = BookSource::from_json(&json).unwrap();
        assert_eq!(bs, bs2);
    }

    #[test]
    fn rejects_unknown_top_level_field() {
        // Misspell the OPTIONAL `group` key. Renaming a required key such as `name`
        // would also fail with "missing field", so the test would pass even if
        // `deny_unknown_fields` were removed; with `group` every required key is
        // still present and only the unknown-field check can reject the input.
        let bad = BILIXS_V2.replacen("\"group\":", "\"gruop\":", 1);
        assert_ne!(bad, BILIXS_V2, "fixture must contain a top-level `group` key");
        assert!(
            BookSource::from_json(&bad).is_err(),
            "拼错字段应被 deny_unknown_fields 拒绝"
        );
    }
}
720
/// Drift guard: `book-source.schema.json` must equal the schema generated from the
/// types at test time (built with `--features schema`).
/// A failure means the config types changed without regenerating the schema; rerun
/// `gen_schema` as the assertion message instructs.
#[cfg(all(test, feature = "schema"))]
mod schema_sync {
    #[test]
    fn schema_is_in_sync() {
        // Schema as committed to the repository.
        let on_disk = include_str!("../book-source.schema.json");
        // Schema derived from the current type definitions.
        let root = schemars::schema_for!(crate::BookSource);
        let fresh = serde_json::to_string_pretty(&root).unwrap();
        assert_eq!(
            fresh.trim(),
            on_disk.trim(),
            "book-source.schema.json 与配置类型不同步;请重新生成:\n  \
             cargo run -p parse-book-source --features schema --example gen_schema \
             > crates/parse-book-source/book-source.schema.json"
        );
    }
}