1use super::error::ConfigError;
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
// Kind tag stored in `LeafRule::via`.
//
// Serialized as the lowercase variant name (`"css"`, `"xpath"`, `"json"`, `"regex"`, `"raw"`);
// `Css` is the value used when the `via` key is omitted.
// NOTE(review): presumably this picks the engine that interprets `LeafRule::select` —
// confirm against the rule evaluator, which is not visible in this part of the file.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(rename_all = "lowercase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Via {
    #[default]
    Css,
    Xpath,
    Json,
    Regex,
    Raw,
}
25
// Value of `LeafRule::extract`.
//
// Untagged on the wire, so the JSON shape alone selects the variant:
//   * a bare string naming an `ExtractOp` (e.g. `"text"`)  -> `Op`
//   * an object of the form `{ "attr": "<name>" }`          -> `Attr`
// NOTE(review): `Attr` presumably reads the named attribute of the selected node —
// confirm against the rule evaluator.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(untagged)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Extract {
    Op(ExtractOp),
    Attr { attr: String },
}
34
35impl Default for Extract {
36 fn default() -> Self {
37 Extract::Op(ExtractOp::Text)
38 }
39}
40
// Named extraction operations accepted as the string form of `Extract`.
//
// Serialized in camelCase: `"text"`, `"ownText"`, `"html"`, `"innerHtml"`, `"outerHtml"`.
// `Text` is the default.
// NOTE(review): the exact difference between `Html`, `InnerHtml` and `OuterHtml` is defined by
// the evaluator, not by this declaration — confirm there before relying on it.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(rename_all = "camelCase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum ExtractOp {
    #[default]
    Text,
    OwnText,
    Html,
    InnerHtml,
    OuterHtml,
}
53
// One entry of `LeafRule::clean`.
//
// Every field is optional and is left out of the serialized form when it is `None`. Keys other
// than the five declared here are rejected on deserialization (`deny_unknown_fields`).
// NOTE(review): how the fields interact (for instance `regex` together with `replace`, as the
// test fixture in this file uses them) is decided by the code that applies a step, which is
// not visible here.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct CleanStep {
    // Pattern text; kept as a plain string, nothing in this type compiles or validates it.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex: Option<String>,
    // Replacement text (presumably applied to `regex` matches — confirm in the evaluator).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub replace: Option<String>,
    // Tri-state flag: absent, `true` or `false`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub trim: Option<bool>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prepend: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub append: Option<String>,
}
70
// Payload of `Rule::Leaf`: a single selection/extraction description.
//
// Keys are camelCase on the wire. Every field has a default and unknown keys are NOT denied on
// this struct, so any JSON object whose known keys have the right types deserializes into a
// `LeafRule` (an empty object yields `LeafRule::default()`).
// NOTE(review): that leniency means a misspelled key inside a leaf rule is silently ignored.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(rename_all = "camelCase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct LeafRule {
    // Defaults to `Via::Css`; always written when serializing.
    #[serde(default)]
    pub via: Via,
    // Selector/expression text; omitted from output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub select: Option<String>,
    // Signed on purpose at the type level; NOTE(review): presumably negative values count from
    // the end of the match list — confirm in the evaluator.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub index: Option<i64>,
    // Defaults to `Extract::Op(ExtractOp::Text)`; always written when serializing.
    #[serde(default)]
    pub extract: Extract,
    // Post-processing steps in declaration order; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub clean: Vec<CleanStep>,
}
87
// A rule expression; recursive through `FirstOf` and `Concat`.
//
// The enum is untagged, so serde tries the variants in declaration order and keeps the first
// one that deserializes:
//   * `{ "firstOf": [...] }`                  -> `FirstOf`
//   * `{ "concat": [...], "join": "..." }`    -> `Concat` (`join` defaults to the empty string)
//   * `{ "literal": "..." }`                  -> `Literal`
//   * `{ "template": "..." }`                 -> `Template`
//   * any other object                        -> `Leaf`
// `Leaf` must stay last: `LeafRule` accepts any object, so it would shadow every variant
// declared after it.
// NOTE(review): the evaluation semantics (fallback order, placeholder syntax of `template`) are
// implemented elsewhere; only the data shape is defined here.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(untagged)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Rule {
    FirstOf {
        // Wire key is `firstOf`; renamed per field because the enum has no `rename_all`.
        #[serde(rename = "firstOf")]
        first_of: Vec<Rule>,
    },
    Concat {
        concat: Vec<Rule>,
        #[serde(default)]
        join: String,
    },
    Literal { literal: String },
    Template { template: String },
    Leaf(LeafRule),
}
113
// Either a plain string or a full `Rule`; used by `Request::url`, `Request::body` and
// `Category::url`.
//
// Untagged: a JSON string deserializes to `Str`, anything else is tried as a `Rule`.
// The rule is boxed, which keeps this enum small regardless of the size of `Rule`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(untagged)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum UrlOrRule {
    Str(String),
    Rule(Box<Rule>),
}
122
// Value of `Http::charset`.
//
// Serialized in kebab-case. Because no variant contains an inner capital letter, the wire names
// are `"auto"`, `"utf8"`, `"gbk"`, `"gb18030"` and `"big5"` (note: `"utf-8"` is not accepted).
// `Auto` is the default.
// NOTE(review): what `Auto` detects from is decided by the fetch layer — not visible here.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Charset {
    #[default]
    Auto,
    Utf8,
    Gbk,
    Gb18030,
    Big5,
}
137
// Value of `Http::retry`.
//
// Keys are camelCase (`max`, `backoffMs`); unknown keys are rejected. Both fields fall back to
// `0` when omitted.
// NOTE(review): `backoff_ms` is presumably a delay in milliseconds, going by its name —
// confirm how the fetch layer applies it (fixed vs. growing delay).
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Retry {
    #[serde(default)]
    pub max: u32,
    #[serde(default)]
    pub backoff_ms: u64,
}
148
// Value of `Http::rate_limit`.
//
// Keys are camelCase (`maxCount`, `perMs`); unknown keys are rejected. Unlike `Retry`, neither
// field has a serde default, so both keys are required whenever the object is present. The
// derived `Default` (both zero) only matters for values built in Rust code.
// NOTE(review): presumably "at most `max_count` requests per `per_ms` milliseconds" —
// confirm against the limiter implementation.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct RateLimit {
    pub max_count: u64,
    pub per_ms: u64,
}
157
// Value of `Http::fetcher`.
//
// Serialized as the lowercase variant name (`"auto"`, `"reqwest"`, `"browser"`); `Auto` is the
// default.
// NOTE(review): how `Auto` chooses between the other two is decided by the fetch layer —
// not visible here.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum FetchMode {
    #[default]
    Auto,
    Reqwest,
    Browser,
}
172
// The `http` section of a `BookSource`.
//
// Keys are camelCase; unknown keys are rejected. Every field is optional on input, so an empty
// object (or an omitted `http` key on `BookSource`) yields `Http::default()`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Http {
    // Header name -> value; always serialized, even when empty.
    #[serde(default)]
    pub headers: HashMap<String, String>,
    // Cookie name -> value; always serialized, even when empty.
    #[serde(default)]
    pub cookies: HashMap<String, String>,
    // List of strings (the test fixture stores a URL here).
    // NOTE(review): presumably URLs requested before the real request — confirm in the fetcher.
    #[serde(default)]
    pub warmup: Vec<String>,
    #[serde(default)]
    pub charset: Charset,
    // NOTE(review): unit is not stated by the type; the fixture value `15000` suggests
    // milliseconds — confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub timeout: Option<u64>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub retry: Option<Retry>,
    // Wire key is `rateLimit`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub rate_limit: Option<RateLimit>,
    #[serde(default)]
    pub fetcher: FetchMode,
}
198
// Value of `Request::method`.
//
// Serialized in upper case (`"GET"`, `"POST"`); `Get` is the default. Only these two values are
// accepted.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Method {
    #[default]
    Get,
    Post,
}
208
// Request description used by `SearchOp::request`.
//
// Keys are camelCase; unknown keys are rejected. Only `url` is required.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Request {
    // Plain string or a rule (the test fixture uses a `template` rule).
    pub url: UrlOrRule,
    // Defaults to `Method::Get`.
    #[serde(default)]
    pub method: Method,
    // Optional body, in the same string-or-rule form as `url`; omitted from output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub body: Option<UrlOrRule>,
    // Header name -> value; always serialized, even when empty.
    // NOTE(review): how these combine with `Http::headers` is decided by the fetch layer.
    #[serde(default)]
    pub headers: HashMap<String, String>,
    // Name -> rule map.
    // NOTE(review): presumably extra variables made available to templates — confirm.
    #[serde(default)]
    pub vars: HashMap<String, Rule>,
}
225
// Per-book field rules; used as `SearchOp::item`, `ExploreOp::item` and `BookSource::book_info`.
//
// Keys are camelCase (`bookUrl`, `name`, `author`, `cover`, `intro`, `kind`, `lastChapter`,
// `tocUrl`, `wordCount`); unknown keys are rejected. Every rule is optional and is omitted from
// the serialized form when `None`, so `{}` is a valid `BookRules`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct BookRules {
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub book_url: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub name: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub author: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cover: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub intro: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub kind: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_chapter: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub toc_url: Option<Rule>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub word_count: Option<Rule>,
}
253
// The optional `search` section of a `BookSource`.
//
// All three keys are required; unknown keys are rejected.
// NOTE(review): presumably `request` is sent, `list` selects the result entries and `item` is
// evaluated per entry — confirm against the search implementation.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct SearchOp {
    pub request: Request,
    pub list: Rule,
    pub item: BookRules,
}
263
// One entry of `ExploreOp::categories`: a title plus a URL given as a string or a rule.
//
// Both keys are required; unknown keys are rejected.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Category {
    pub title: String,
    pub url: UrlOrRule,
}
272
// The optional `explore` section of a `BookSource`.
//
// All three keys are required (an empty `categories` array is still accepted); unknown keys
// are rejected.
// NOTE(review): presumably `list`/`item` are applied to the page behind each category URL, as
// in `SearchOp` — confirm against the explore implementation.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct ExploreOp {
    pub categories: Vec<Category>,
    pub list: Rule,
    pub item: BookRules,
}
282
/// Serde default for the `max_pages` fields of `TocRules` and `ContentRules`.
///
/// Returns `100`, the value used when a configuration omits `maxPages`.
fn default_max_pages() -> u32 {
    const DEFAULT_MAX_PAGES: u32 = 100;
    DEFAULT_MAX_PAGES
}
286
// The required `toc` section of a `BookSource`.
//
// Keys are camelCase; unknown keys are rejected. `list`, `name` and `url` are required.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct TocRules {
    pub list: Rule,
    pub name: Rule,
    pub url: Rule,
    // Wire key `isVolume`; omitted from output when `None`.
    // NOTE(review): presumably marks a list entry as a volume heading rather than a chapter —
    // confirm against the table-of-contents implementation.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub is_volume: Option<Rule>,
    // Wire key `nextPage`; omitted from output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub next_page: Option<Rule>,
    // Wire key `maxPages`; falls back to `default_max_pages()` (100) when omitted and is always
    // written when serializing.
    #[serde(default = "default_max_pages")]
    pub max_pages: u32,
}
302
// The required `content` section of a `BookSource`.
//
// Keys are camelCase; unknown keys are rejected. Only `value` is required.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct ContentRules {
    pub value: Rule,
    // Wire key `nextPage`; omitted from output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub next_page: Option<Rule>,
    // Wire key `maxPages`; falls back to `default_max_pages()` (100) when omitted and is always
    // written when serializing.
    #[serde(default = "default_max_pages")]
    pub max_pages: u32,
}
314
// Value of `Sample::expect`.
//
// Keys are camelCase (`name`, `minChapters`, `volumes`, `minContentChars`). All are optional
// and omitted from output when `None`.
// NOTE(review): unlike `Sample` and most other structs in this file, this one does not set
// `deny_unknown_fields`, so unrecognized keys inside `expect` are silently ignored. Confirm
// whether that is intentional before tightening it.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Expect {
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_chapters: Option<usize>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub volumes: Option<usize>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_content_chars: Option<usize>,
}
331
// One entry of `BookSource::samples`.
//
// Keys are camelCase; unknown keys are rejected. `bookUrl` is required, `expect` falls back to
// an all-`None` `Expect` when omitted.
// NOTE(review): the test fixture stores a site-relative path in `bookUrl`; how it is resolved
// is up to the code that consumes samples.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Sample {
    pub book_url: String,
    #[serde(default)]
    pub expect: Expect,
}
341
// Root of a book-source configuration document.
//
// Keys are camelCase; unknown keys are rejected. `schema`, `name`, `url`, `bookInfo`, `toc` and
// `content` are required, everything else is optional.
//
// NOTE(review): comments on the schema-deriving types in this file use plain `//` on purpose.
// schemars copies `///` doc comments into the generated JSON schema as descriptions, which
// would change the schema output compared by the `schema_is_in_sync` test.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct BookSource {
    // Schema identifier string. The type itself accepts any string; the tests compare it with
    // `SCHEMA_ID`.
    pub schema: String,
    pub name: String,
    // Defaults to the empty string and is omitted from output when empty.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub group: String,
    pub url: String,
    // Defaults to `Http::default()`; always written when serializing.
    #[serde(default)]
    pub http: Http,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub search: Option<SearchOp>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub explore: Option<ExploreOp>,
    // Wire key `bookInfo`. Required, although `{}` is accepted since every rule is optional.
    pub book_info: BookRules,
    pub toc: TocRules,
    pub content: ContentRules,
    // Omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub samples: Vec<Sample>,
}
368
/// Identifier expected in [`BookSource::schema`] for documents in this format.
///
/// NOTE(review): the loaders in this file do not compare the parsed `schema` value against this
/// constant; any such validation would have to live elsewhere.
pub const SCHEMA_ID: &str = "trnovel-booksource/v2";
371
372impl BookSource {
373 pub fn from_json(s: &str) -> Result<Self, ConfigError> {
375 Ok(serde_json::from_str(s)?)
376 }
377
378 pub fn from_value_many(value: serde_json::Value) -> Result<Vec<Self>, ConfigError> {
380 if value.is_array() {
381 Ok(serde_json::from_value(value)?)
382 } else {
383 Ok(vec![serde_json::from_value(value)?])
384 }
385 }
386
387 pub fn from_path(path: &str) -> Result<Vec<Self>, super::error::BookSourceError> {
389 let text = std::fs::read_to_string(path).map_err(ConfigError::Io)?;
390 let value = serde_json::from_str(&text).map_err(ConfigError::Json)?;
391 Ok(Self::from_value_many(value)?)
392 }
393
394 pub async fn from_url(url: &str) -> Result<Vec<Self>, super::error::BookSourceError> {
396 use super::error::FetchError;
397 let text = reqwest::get(url)
398 .await
399 .map_err(FetchError::Http)?
400 .error_for_status()
402 .map_err(FetchError::Http)?
403 .text()
404 .await
405 .map_err(FetchError::Http)?;
406 let value = serde_json::from_str(&text).map_err(ConfigError::Json)?;
407 Ok(Self::from_value_many(value)?)
408 }
409}
410
#[cfg(test)]
mod tests {
    use super::*;

    // Complete v2 document used as the fixture for every test in this module. It exercises
    // each top-level section plus the `firstOf`, `concat`, `template` and leaf rule shapes.
    const BILIXS_V2: &str = r#"{
        "schema": "trnovel-booksource/v2",
        "name": "哔哩小说",
        "group": "测试",
        "url": "https://www.bilixs.com",
        "http": {
            "headers": { "User-Agent": "Mozilla/5.0" },
            "cookies": {},
            "warmup": ["https://www.bilixs.com/"],
            "charset": "auto",
            "timeout": 15000,
            "retry": { "max": 2, "backoffMs": 500 }
        },
        "search": {
            "request": { "url": { "template": "{{base}}/search.html?searchkey={{key}}" }, "method": "GET" },
            "list": { "via": "css", "select": ".module-item" },
            "item": {
                "name": { "via": "css", "select": ".module-item-title", "extract": "text" },
                "tocUrl": { "via": "css", "select": ".module-item-title", "extract": { "attr": "href" } }
            }
        },
        "explore": {
            "categories": [ { "title": "最近更新", "url": { "template": "{{base}}/book/lastupdate_0_1_0_0_0_0_0_{{page}}_0.html" } } ],
            "list": { "via": "css", "select": ".module-item" },
            "item": { "name": { "via": "css", "select": ".module-item-title", "extract": "text" } }
        },
        "bookInfo": {
            "name": { "via": "css", "select": "[property=\"og:novel:book_name\"]", "extract": { "attr": "content" } },
            "cover": { "via": "css", "select": "[property=\"og:image\"]", "extract": { "attr": "content" } },
            "kind": { "concat": [
                { "via": "css", "select": "[property=\"og:novel:tags\"]", "extract": { "attr": "content" } },
                { "via": "css", "select": "[property=\"og:novel:status\"]", "extract": { "attr": "content" } }
            ], "join": " · " },
            "tocUrl": { "via": "css", "select": "[property=\"og:novel:read_url\"]", "extract": { "attr": "content" } }
        },
        "toc": {
            "list": { "via": "css", "select": ".box > h2.module-title.type, .box a.module-row-text" },
            "name": { "firstOf": [
                { "via": "css", "select": ".module-row-title", "extract": "text" },
                { "via": "css", "select": "h2", "extract": "text" }
            ] },
            "url": { "via": "css", "select": "a", "extract": { "attr": "href" } },
            "isVolume": { "via": "css", "select": "h2", "extract": "text" },
            "maxPages": 1
        },
        "content": {
            "value": { "via": "css", "select": ".article-content", "extract": "html",
                "clean": [ { "regex": "请收藏本站[^<\\n]*", "replace": "" }, { "trim": true } ] }
        },
        "samples": [
            { "bookUrl": "/novel/guzhenren.html", "expect": { "name": "蛊真人", "volumes": 8, "minChapters": 2000 } }
        ]
    }"#;

    // The fixture parses and its identifying fields come through unchanged.
    #[test]
    fn parses_v2_book_source() {
        let bs = BookSource::from_json(BILIXS_V2).expect("应解析 v2 书源");
        assert_eq!(bs.schema, SCHEMA_ID);
        assert_eq!(bs.name, "哔哩小说");
    }

    // `{ "firstOf": [...] }` selects `Rule::FirstOf` and keeps both alternatives.
    #[test]
    fn toc_name_is_firstof_with_two_leaves() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        match &bs.toc.name {
            Rule::FirstOf { first_of } => assert_eq!(first_of.len(), 2),
            other => panic!("toc.name 应为 firstOf,实际 {other:?}"),
        }
    }

    // An object without any combinator key falls through to `Rule::Leaf`.
    #[test]
    fn toc_is_volume_is_leaf_css_h2() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let iv = bs.toc.is_volume.as_ref().expect("isVolume 应存在");
        match iv {
            Rule::Leaf(l) => {
                assert_eq!(l.via, Via::Css);
                assert_eq!(l.select.as_deref(), Some("h2"));
            }
            other => panic!("isVolume 应为叶子,实际 {other:?}"),
        }
    }

    // A non-string `url` is parsed as a rule, here a `template`.
    #[test]
    fn search_url_is_template_rule() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let req = &bs.search.as_ref().unwrap().request;
        match &req.url {
            UrlOrRule::Rule(r) => assert!(matches!(**r, Rule::Template { .. })),
            other => panic!("search.request.url 应为模板规则,实际 {other:?}"),
        }
    }

    // The object form of `extract` maps to `Extract::Attr`.
    #[test]
    fn book_info_cover_extracts_attr() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        match bs.book_info.cover.as_ref().unwrap() {
            Rule::Leaf(l) => assert_eq!(
                l.extract,
                Extract::Attr {
                    attr: "content".into()
                }
            ),
            other => panic!("cover 应为属性抽取叶子,实际 {other:?}"),
        }
    }

    // The `http` section round-trips its list, enum and nested camelCase fields.
    #[test]
    fn http_cookies_and_warmup_parsed() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        assert_eq!(bs.http.warmup, vec!["https://www.bilixs.com/"]);
        assert_eq!(bs.http.charset, Charset::Auto);
        assert_eq!(bs.http.retry.as_ref().unwrap().backoff_ms, 500);
    }

    // camelCase keys inside `expect` reach the snake_case fields.
    #[test]
    fn sample_expectations_parsed() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let s = &bs.samples[0];
        assert_eq!(s.expect.volumes, Some(8));
        assert_eq!(s.expect.min_chapters, Some(2000));
    }

    // Serializing and re-parsing yields an equal value.
    #[test]
    fn round_trips_through_json() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let json = serde_json::to_string(&bs).unwrap();
        let bs2 = BookSource::from_json(&json).unwrap();
        assert_eq!(bs, bs2);
    }

    // A misspelled top-level key must be rejected because of `deny_unknown_fields`.
    #[test]
    fn rejects_unknown_top_level_field() {
        // Misspell an OPTIONAL key. `group` has a serde default, so the document would still
        // parse if unknown keys were merely ignored. Misspelling a required key such as `name`
        // fails with "missing field" regardless of `deny_unknown_fields`, so it would not
        // exercise the attribute at all.
        let bad = BILIXS_V2.replacen("\"group\":", "\"gruop\":", 1);
        // Guard against fixture edits turning the replacement into a silent no-op.
        assert_ne!(bad.as_str(), BILIXS_V2);
        assert!(
            BookSource::from_json(&bad).is_err(),
            "拼错字段应被 deny_unknown_fields 拒绝"
        );
    }
}
557
// Compiled only for test builds with the `schema` feature enabled.
#[cfg(all(test, feature = "schema"))]
mod schema_sync {
    // Regenerates the JSON schema from the Rust types and compares it, ignoring leading and
    // trailing whitespace, with the schema file embedded at compile time. The failure message
    // carries the command that regenerates the file.
    #[test]
    fn schema_is_in_sync() {
        let root_schema = schemars::schema_for!(crate::BookSource);
        let rendered = serde_json::to_string_pretty(&root_schema).unwrap();
        let on_disk = include_str!("../book-source.schema.json");

        assert_eq!(
            rendered.trim(),
            on_disk.trim(),
            "book-source.schema.json 与配置类型不同步;请重新生成:\n \
             cargo run -p parse-book-source --features schema --example gen_schema \
             > crates/parse-book-source/book-source.schema.json"
        );
    }
}