use super::error::ConfigError;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
// Selection backend named by a leaf rule's `via` key.
//
// Serialized in lowercase (`css`, `xpath`, `json`, `regex`, `raw`). `Css` is
// the default, and `LeafRule` falls back to it when `via` is omitted.
// NOTE(review): how each backend interprets the rule's `select` string is
// decided by the rule evaluator, which is not in this file — confirm there.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(rename_all = "lowercase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Via {
#[default]
Css,
Xpath,
Json,
Regex,
Raw,
}
// Value of a leaf rule's `extract` key.
//
// The enum is untagged, so the JSON form is either a bare `ExtractOp` string
// (for example `"text"`) or an object of the shape `{ "attr": "<name>" }`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(untagged)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Extract {
// One of the named operations, written as a plain string.
Op(ExtractOp),
// An attribute lookup, written as `{ "attr": "href" }`.
Attr { attr: String },
}
impl Default for Extract {
    /// Returns the extraction used when a rule omits `extract`:
    /// the `Op` variant wrapping [`ExtractOp::Text`].
    fn default() -> Self {
        Self::Op(ExtractOp::Text)
    }
}
// Named extraction operations accepted as a bare string in `extract`.
//
// Serialized in camelCase: `text`, `ownText`, `html`, `innerHtml`,
// `outerHtml`. `Text` is the default.
// NOTE(review): the precise difference between `html`, `innerHtml` and
// `outerHtml` is defined by the evaluator, not here — confirm there.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(rename_all = "camelCase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum ExtractOp {
#[default]
Text,
OwnText,
Html,
InnerHtml,
OuterHtml,
}
// One entry of a leaf rule's `clean` list.
//
// Every field is optional, defaults to `None` when absent, and is left out of
// the serialized output when `None`. Unknown keys are rejected.
// NOTE(review): presumably `regex` + `replace` form a substitution and
// `trim` / `prepend` / `append` are further post-processing switches; which
// combinations are valid and in what order they run is up to the evaluator.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct CleanStep {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub regex: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub replace: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub trim: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub prepend: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub append: Option<String>,
}
// A single extraction step: the non-combinator form of `Rule`.
//
// All fields are optional on input. Unlike most structs in this file there is
// no `deny_unknown_fields`, so unrecognised keys are ignored on parse.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(rename_all = "camelCase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct LeafRule {
// Selection backend; defaults to `Via::Css` and is always serialized.
#[serde(default)]
pub via: Via,
// Selector/expression string handed to the backend; omitted when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub select: Option<String>,
// Optional signed position among the matches.
// NOTE(review): presumably negative values count from the end — confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub index: Option<i64>,
// What to extract; defaults to `Extract::Op(ExtractOp::Text)`.
#[serde(default)]
pub extract: Extract,
// Post-processing steps; omitted from the output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub clean: Vec<CleanStep>,
}
// A rule tree: either a combinator over child rules, a constant, a template,
// or a leaf extraction.
//
// The enum is untagged, so the shape of the JSON object selects the variant:
//   { "firstOf": [...] }              -> FirstOf
//   { "concat": [...], "join": "" }   -> Concat (`join` defaults to "")
//   { "literal": "..." }              -> Literal
//   { "template": "..." }             -> Template
//   anything else accepted by LeafRule -> Leaf
// NOTE(review): `LeafRule` has only optional fields and does not deny unknown
// keys, so an object with a misspelled combinator key (e.g. `frstOf`) most
// likely parses as a default `Leaf` instead of failing — confirm whether that
// leniency is intended.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(untagged)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Rule {
// Ordered alternatives under the JSON key `firstOf`.
FirstOf {
#[serde(rename = "firstOf")]
first_of: Vec<Rule>,
},
// Child rules plus a separator string (empty when `join` is omitted).
Concat {
concat: Vec<Rule>,
#[serde(default)]
join: String,
},
// A fixed string.
Literal { literal: String },
// A template string; the test fixture uses `{{base}}`, `{{key}}`, `{{page}}`.
Template { template: String },
// A single extraction step.
Leaf(LeafRule),
}
// A value that may be written either as a plain JSON string or as a `Rule`
// object. Untagged: a JSON string becomes `Str`, otherwise `Rule` is tried.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(untagged)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum UrlOrRule {
// Plain string form.
Str(String),
// Rule form, boxed.
Rule(Box<Rule>),
}
// Character set selector for the `http.charset` key; `Auto` is the default.
//
// Serialized with serde's `kebab-case` rule, which only inserts `-` before
// uppercase letters, so the names come out as `auto`, `utf8`, `gbk`,
// `gb18030`, `big5` (not `utf-8`).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Charset {
#[default]
Auto,
Utf8,
Gbk,
Gb18030,
Big5,
}
// Retry settings under `http.retry`. JSON keys: `max`, `backoffMs`.
// Both fields default to 0 when omitted; unknown keys are rejected.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Retry {
// NOTE(review): presumably the maximum number of retry attempts — confirm.
#[serde(default)]
pub max: u32,
// Backoff in milliseconds, per the field name.
#[serde(default)]
pub backoff_ms: u64,
}
// Rate-limit settings under `http.rateLimit`. JSON keys: `maxCount`, `perMs`.
//
// The struct derives `Default`, but neither field carries `#[serde(default)]`,
// so both keys are required whenever the object is present in JSON.
// NOTE(review): presumably "at most `maxCount` requests per `perMs`
// milliseconds" — confirm against the fetcher.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct RateLimit {
pub max_count: u64,
pub per_ms: u64,
}
// Fetcher selection for the `http.fetcher` key.
// Serialized in lowercase (`auto`, `reqwest`, `browser`); `Auto` is the default.
// NOTE(review): what `Auto` chooses between is decided outside this file.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum FetchMode {
#[default]
Auto,
Reqwest,
Browser,
}
// Source-wide HTTP settings (the `http` object of a book source).
//
// Every field is optional on input and unknown keys are rejected. Fields
// without `skip_serializing_if` (`headers`, `cookies`, `warmup`, `charset`,
// `fetcher`) are always written back out, even when left at their defaults.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Http {
// Header name -> value.
#[serde(default)]
pub headers: HashMap<String, String>,
// Cookie name -> value.
#[serde(default)]
pub cookies: HashMap<String, String>,
// List of URL strings.
// NOTE(review): presumably requested before the real fetch — confirm.
#[serde(default)]
pub warmup: Vec<String>,
// Response charset selector; defaults to `Charset::Auto`.
#[serde(default)]
pub charset: Charset,
// NOTE(review): unit is not stated here; the test fixture uses 15000, so
// this is presumably milliseconds — confirm against the fetcher.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub timeout: Option<u64>,
// Optional retry settings; omitted from the output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub retry: Option<Retry>,
// Optional rate limit (JSON key `rateLimit`); omitted when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub rate_limit: Option<RateLimit>,
// Fetcher selection; defaults to `FetchMode::Auto`.
#[serde(default)]
pub fetcher: FetchMode,
}
// HTTP method of a `Request`. Serialized in upper case (`GET`, `POST`);
// `Get` is the default.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum Method {
#[default]
Get,
Post,
}
// Description of one HTTP request. Only `url` is required; unknown keys are
// rejected.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Request {
// Target URL, as a plain string or as a rule (e.g. a template).
pub url: UrlOrRule,
// HTTP method; defaults to `Method::Get`.
#[serde(default)]
pub method: Method,
// Optional body, as a plain string or a rule; omitted when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub body: Option<UrlOrRule>,
// Header name -> value for this request.
// NOTE(review): how these combine with `Http::headers` is decided by the
// fetcher, not here.
#[serde(default)]
pub headers: HashMap<String, String>,
// Named rules.
// NOTE(review): presumably evaluated into template variables — confirm.
#[serde(default)]
pub vars: HashMap<String, Rule>,
}
// Per-book extraction rules, one optional `Rule` per book attribute.
//
// Used for search/explore list items (`SearchOp::item`, `ExploreOp::item`)
// and for the book detail page (`BookSource::book_info`). JSON keys are
// camelCase (`bookUrl`, `lastChapter`, `tocUrl`, `wordCount`, ...). Absent
// rules are `None` and are left out when serializing; unknown keys are
// rejected.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct BookRules {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub book_url: Option<Rule>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub name: Option<Rule>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub author: Option<Rule>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cover: Option<Rule>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub intro: Option<Rule>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub kind: Option<Rule>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub last_chapter: Option<Rule>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub toc_url: Option<Rule>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub word_count: Option<Rule>,
}
// The `search` operation of a book source. All three keys are required and
// unknown keys are rejected.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct SearchOp {
// Request to issue for a search.
pub request: Request,
// Rule selecting the result entries.
pub list: Rule,
// Rules applied to each entry selected by `list`.
pub item: BookRules,
}
// One entry of `ExploreOp::categories`: a title plus the URL to load for it
// (plain string or rule). Both keys are required; unknown keys are rejected.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Category {
pub title: String,
pub url: UrlOrRule,
}
// The `explore` operation of a book source: a set of categories plus the
// rules for reading a category page. All three keys are required and unknown
// keys are rejected.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct ExploreOp {
// Browsable categories.
pub categories: Vec<Category>,
// Rule selecting the book entries on a category page.
pub list: Rule,
// Rules applied to each entry selected by `list`.
pub item: BookRules,
}
/// Serde default for the `max_pages` fields of `TocRules` and `ContentRules`:
/// the value used when a config omits `maxPages`.
fn default_max_pages() -> u32 {
    const DEFAULT_MAX_PAGES: u32 = 100;
    DEFAULT_MAX_PAGES
}
// Rules for reading a table of contents (the `toc` object).
//
// `list`, `name` and `url` are required. JSON keys are camelCase
// (`isVolume`, `nextPage`, `maxPages`); unknown keys are rejected.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct TocRules {
// Rule selecting the TOC entries.
pub list: Rule,
// Rule for an entry's display name.
pub name: Rule,
// Rule for an entry's URL.
pub url: Rule,
// Optional rule; omitted when `None`.
// NOTE(review): presumably a non-empty result marks the entry as a volume
// heading rather than a chapter — confirm against the evaluator.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub is_volume: Option<Rule>,
// Optional rule for the next page; omitted when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub next_page: Option<Rule>,
// Page cap; taken from `default_max_pages()` when omitted, always serialized.
#[serde(default = "default_max_pages")]
pub max_pages: u32,
}
// Rules for reading chapter content (the `content` object).
// Only `value` is required; JSON keys are camelCase and unknown keys are
// rejected.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct ContentRules {
// Rule producing the chapter text.
pub value: Rule,
// Optional rule for the next page; omitted when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub next_page: Option<Rule>,
// Page cap; taken from `default_max_pages()` when omitted, always serialized.
#[serde(default = "default_max_pages")]
pub max_pages: u32,
}
// Optional expectations attached to a `Sample`.
//
// JSON keys are camelCase (`minChapters`, `minContentChars`). Every field is
// optional and omitted from the output when `None`.
// NOTE(review): unlike the neighbouring structs this one has no
// `deny_unknown_fields`, so misspelled keys are silently ignored — confirm
// whether that is intentional.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Expect {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub name: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub min_chapters: Option<usize>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub volumes: Option<usize>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub min_content_chars: Option<usize>,
}
// One entry of `BookSource::samples`: a book URL plus optional expectations.
// `bookUrl` is required; `expect` falls back to an all-`None` `Expect`.
// Unknown keys are rejected.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct Sample {
// The test fixture uses a site-relative path here (`/novel/...`).
pub book_url: String,
#[serde(default)]
pub expect: Expect,
}
// Top-level book source definition.
//
// Required keys: `schema`, `name`, `url`, `bookInfo`, `toc`, `content`.
// JSON keys are camelCase and unknown top-level keys are rejected.
// NOTE(review): plain `//` comments are used on purpose — `///` docs on types
// deriving `schemars::JsonSchema` presumably flow into the generated schema
// and would require regenerating the committed schema file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct BookSource {
// Schema identifier string. Nothing in this type checks it against
// `SCHEMA_ID`; any string deserializes.
pub schema: String,
// Display name of the source.
pub name: String,
// Optional group label; omitted from the output when empty.
#[serde(default, skip_serializing_if = "String::is_empty")]
pub group: String,
// Site URL. NOTE(review): presumably what `{{base}}` expands to — confirm.
pub url: String,
// Source-wide HTTP settings; defaults to `Http::default()`.
#[serde(default)]
pub http: Http,
// Optional search operation; omitted when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub search: Option<SearchOp>,
// Optional explore operation; omitted when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub explore: Option<ExploreOp>,
// Rules for the book detail page (JSON key `bookInfo`).
pub book_info: BookRules,
// Rules for the table of contents.
pub toc: TocRules,
// Rules for chapter content.
pub content: ContentRules,
// Optional samples; omitted from the output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub samples: Vec<Sample>,
}
/// Schema identifier string for this book-source format; the unit tests
/// expect the fixture's `schema` field to equal it.
pub const SCHEMA_ID: &str = "trnovel-booksource/v2";
impl BookSource {
    /// Parses a single book source from a JSON document.
    ///
    /// # Errors
    /// Returns a [`ConfigError`] (converted from the `serde_json` error) when
    /// `s` is not valid JSON or does not match the `BookSource` shape.
    pub fn from_json(s: &str) -> Result<Self, ConfigError> {
        let source: Self = serde_json::from_str(s)?;
        Ok(source)
    }

    /// Converts an already-parsed JSON value into one or more book sources.
    ///
    /// A top-level array is read as a list of sources; any other value is
    /// read as a single source and returned in a one-element vector.
    ///
    /// # Errors
    /// Returns a [`ConfigError`] when the value does not match the expected
    /// shape.
    pub fn from_value_many(value: serde_json::Value) -> Result<Vec<Self>, ConfigError> {
        // Guard clause: the non-array case yields exactly one source.
        if !value.is_array() {
            let single: Self = serde_json::from_value(value)?;
            return Ok(vec![single]);
        }
        let many: Vec<Self> = serde_json::from_value(value)?;
        Ok(many)
    }

    /// Loads book sources from the JSON file at `path`.
    ///
    /// The file may hold a single source object or an array of them.
    ///
    /// # Errors
    /// Read failures are reported as `ConfigError::Io` and malformed JSON as
    /// `ConfigError::Json`; both, as well as shape errors from
    /// [`BookSource::from_value_many`], are converted into `BookSourceError`.
    pub fn from_path(path: &str) -> Result<Vec<Self>, super::error::BookSourceError> {
        let text = std::fs::read_to_string(path).map_err(ConfigError::Io)?;
        let value: serde_json::Value = serde_json::from_str(&text).map_err(ConfigError::Json)?;
        let sources = Self::from_value_many(value)?;
        Ok(sources)
    }

    /// Downloads book sources from `url` with a plain `reqwest::get`.
    ///
    /// The response body may hold a single source object or an array of them.
    ///
    /// # Errors
    /// Transport failures, non-success HTTP statuses and body-read failures
    /// are reported as `FetchError::Http`; malformed JSON as
    /// `ConfigError::Json`. All are converted into `BookSourceError`.
    pub async fn from_url(url: &str) -> Result<Vec<Self>, super::error::BookSourceError> {
        use super::error::FetchError;

        let response = reqwest::get(url).await.map_err(FetchError::Http)?;
        // Turn 4xx/5xx responses into errors before touching the body.
        let response = response.error_for_status().map_err(FetchError::Http)?;
        let text = response.text().await.map_err(FetchError::Http)?;

        let value: serde_json::Value = serde_json::from_str(&text).map_err(ConfigError::Json)?;
        let sources = Self::from_value_many(value)?;
        Ok(sources)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Complete v2 book source used as the fixture for every test below.
    const BILIXS_V2: &str = r#"{
"schema": "trnovel-booksource/v2",
"name": "哔哩小说",
"group": "测试",
"url": "https://www.bilixs.com",
"http": {
"headers": { "User-Agent": "Mozilla/5.0" },
"cookies": {},
"warmup": ["https://www.bilixs.com/"],
"charset": "auto",
"timeout": 15000,
"retry": { "max": 2, "backoffMs": 500 }
},
"search": {
"request": { "url": { "template": "{{base}}/search.html?searchkey={{key}}" }, "method": "GET" },
"list": { "via": "css", "select": ".module-item" },
"item": {
"name": { "via": "css", "select": ".module-item-title", "extract": "text" },
"tocUrl": { "via": "css", "select": ".module-item-title", "extract": { "attr": "href" } }
}
},
"explore": {
"categories": [ { "title": "最近更新", "url": { "template": "{{base}}/book/lastupdate_0_1_0_0_0_0_0_{{page}}_0.html" } } ],
"list": { "via": "css", "select": ".module-item" },
"item": { "name": { "via": "css", "select": ".module-item-title", "extract": "text" } }
},
"bookInfo": {
"name": { "via": "css", "select": "[property=\"og:novel:book_name\"]", "extract": { "attr": "content" } },
"cover": { "via": "css", "select": "[property=\"og:image\"]", "extract": { "attr": "content" } },
"kind": { "concat": [
{ "via": "css", "select": "[property=\"og:novel:tags\"]", "extract": { "attr": "content" } },
{ "via": "css", "select": "[property=\"og:novel:status\"]", "extract": { "attr": "content" } }
], "join": " · " },
"tocUrl": { "via": "css", "select": "[property=\"og:novel:read_url\"]", "extract": { "attr": "content" } }
},
"toc": {
"list": { "via": "css", "select": ".box > h2.module-title.type, .box a.module-row-text" },
"name": { "firstOf": [
{ "via": "css", "select": ".module-row-title", "extract": "text" },
{ "via": "css", "select": "h2", "extract": "text" }
] },
"url": { "via": "css", "select": "a", "extract": { "attr": "href" } },
"isVolume": { "via": "css", "select": "h2", "extract": "text" },
"maxPages": 1
},
"content": {
"value": { "via": "css", "select": ".article-content", "extract": "html",
"clean": [ { "regex": "请收藏本站[^<\\n]*", "replace": "" }, { "trim": true } ] }
},
"samples": [
{ "bookUrl": "/novel/guzhenren.html", "expect": { "name": "蛊真人", "volumes": 8, "minChapters": 2000 } }
]
}"#;

    /// The fixture parses and carries the expected schema id and name.
    #[test]
    fn parses_v2_book_source() {
        let bs = BookSource::from_json(BILIXS_V2).expect("应解析 v2 书源");
        assert_eq!(bs.schema, SCHEMA_ID);
        assert_eq!(bs.name, "哔哩小说");
    }

    /// `{ "firstOf": [...] }` selects the `FirstOf` variant with both children.
    #[test]
    fn toc_name_is_firstof_with_two_leaves() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        match &bs.toc.name {
            Rule::FirstOf { first_of } => assert_eq!(first_of.len(), 2),
            other => panic!("toc.name 应为 firstOf,实际 {other:?}"),
        }
    }

    /// `isVolume` is read from its camelCase key as a CSS leaf rule.
    #[test]
    fn toc_is_volume_is_leaf_css_h2() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let iv = bs.toc.is_volume.as_ref().expect("isVolume 应存在");
        match iv {
            Rule::Leaf(l) => {
                assert_eq!(l.via, Via::Css);
                assert_eq!(l.select.as_deref(), Some("h2"));
            }
            other => panic!("isVolume 应为叶子,实际 {other:?}"),
        }
    }

    /// An object-valued `url` becomes `UrlOrRule::Rule` holding a template.
    #[test]
    fn search_url_is_template_rule() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let req = &bs.search.as_ref().unwrap().request;
        match &req.url {
            UrlOrRule::Rule(r) => assert!(matches!(**r, Rule::Template { .. })),
            other => panic!("search.request.url 应为模板规则,实际 {other:?}"),
        }
    }

    /// `{ "attr": "content" }` selects the `Extract::Attr` variant.
    #[test]
    fn book_info_cover_extracts_attr() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        match bs.book_info.cover.as_ref().unwrap() {
            Rule::Leaf(l) => assert_eq!(
                l.extract,
                Extract::Attr {
                    attr: "content".into()
                }
            ),
            other => panic!("cover 应为属性抽取叶子,实际 {other:?}"),
        }
    }

    /// The `http` object is read, including the empty cookie map.
    #[test]
    fn http_cookies_and_warmup_parsed() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        // The fixture declares `"cookies": {}`; previously this test never
        // looked at cookies despite its name.
        assert!(bs.http.cookies.is_empty());
        assert_eq!(bs.http.warmup, vec!["https://www.bilixs.com/"]);
        assert_eq!(bs.http.charset, Charset::Auto);
        assert_eq!(bs.http.retry.as_ref().unwrap().backoff_ms, 500);
    }

    /// Sample expectations are read from their camelCase keys.
    #[test]
    fn sample_expectations_parsed() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let s = &bs.samples[0];
        assert_eq!(s.expect.volumes, Some(8));
        assert_eq!(s.expect.min_chapters, Some(2000));
    }

    /// Serializing and re-parsing yields an equal value.
    #[test]
    fn round_trips_through_json() {
        let bs = BookSource::from_json(BILIXS_V2).unwrap();
        let json = serde_json::to_string(&bs).unwrap();
        let bs2 = BookSource::from_json(&json).unwrap();
        assert_eq!(bs, bs2);
    }

    /// An unknown top-level key must be rejected by `deny_unknown_fields`.
    #[test]
    fn rejects_unknown_top_level_field() {
        // Add a misspelled key but keep the required `name` in place. Merely
        // renaming `name` would also fail with "missing field `name`", so the
        // test would pass even if `deny_unknown_fields` were removed.
        let bad = BILIXS_V2.replacen("\"name\":", "\"nmae\": \"x\", \"name\":", 1);
        assert_ne!(bad, BILIXS_V2, "fixture should contain a top-level `name` key");
        assert!(
            BookSource::from_json(&bad).is_err(),
            "拼错字段应被 deny_unknown_fields 拒绝"
        );
    }
}
#[cfg(all(test, feature = "schema"))]
mod schema_sync {
    /// Regenerates the JSON schema for `BookSource` and compares it, ignoring
    /// leading/trailing whitespace, with the schema file committed next to
    /// the crate sources.
    #[test]
    fn schema_is_in_sync() {
        let on_disk: &str = include_str!("../book-source.schema.json");
        let schema = schemars::schema_for!(crate::BookSource);
        let fresh = serde_json::to_string_pretty(&schema).unwrap();
        assert_eq!(
            fresh.trim(),
            on_disk.trim(),
            "book-source.schema.json 与配置类型不同步;请重新生成:\n \
            cargo run -p parse-book-source --features schema --example gen_schema \
            > crates/parse-book-source/book-source.schema.json"
        );
    }
}