parse_book_source/
lib.rs

1use anyhow::anyhow;
2use serde_json::json;
3
4pub mod analyzer;
5pub mod book;
6pub mod book_source;
7pub mod error;
8pub mod http_client;
9pub mod utils;
10pub use analyzer::*;
11pub use book::*;
12pub use book_source::*;
13pub use error::*;
14pub use http_client::*;
15
16#[derive(Debug, Clone)]
17pub struct BookSourceParser {
18    pub book_source: BookSource,
19    pub http_client: HttpClient,
20    pub analyzer: AnalyzerManager,
21    pub temp: Option<String>,
22}
23
24impl TryFrom<BookSource> for BookSourceParser {
25    type Error = ParseError;
26
27    fn try_from(book_source: BookSource) -> Result<Self> {
28        let mut http_config = book_source.http_config.clone();
29        if let Some(ref header) = book_source.header {
30            http_config.header = Some(serde_json::from_str(header)?);
31        }
32
33        if let Some(ref response_time) = book_source.respond_time {
34            http_config.timeout = Some(*response_time);
35        }
36
37        Ok(Self {
38            http_client: HttpClient::new(&book_source.book_source_url, &http_config)?,
39            book_source,
40            analyzer: AnalyzerManager::new()?,
41            temp: None,
42        })
43    }
44}
45
46impl BookSourceParser {
47    pub fn new(book_source: BookSource) -> Result<Self> {
48        Self::try_from(book_source)
49    }
50
51    /// 获取分类信息
52    pub async fn get_explores(&mut self) -> Result<ExploreList> {
53        if let Some(ref explore_url) = self.book_source.explore_url {
54            if let Some(ref rule_explore_item) = self.book_source.rule_explore_item {
55                let res = self
56                    .http_client
57                    .get(&self.book_source.book_source_url)
58                    .await?
59                    .text()
60                    .await?;
61
62                let list = self.analyzer.get_element(explore_url, &res)?;
63
64                let res = list
65                    .into_iter()
66                    .flat_map(|item| {
67                        rule_explore_item.parse_to_explore_item(&mut self.analyzer, &item)
68                    })
69                    .collect::<Vec<_>>();
70                return Ok(res);
71            } else {
72                return Ok(serde_json::from_str(explore_url)?);
73            }
74        }
75
76        Ok(vec![])
77    }
78
79    /// 搜索书籍
80    pub async fn search_books(&mut self, key: &str, page: u32, page_size: u32) -> Result<BookList> {
81        let url = self.analyzer.get_string(
82            &self.book_source.search_url,
83            "",
84            Some(json!({
85                "key": key,
86                "page": page,
87                "page_size": page_size,
88            })),
89        )?;
90
91        let mut res = String::new();
92
93        for i in url.split(",") {
94            res = self.http_client.get(i).await?.text().await?;
95        }
96
97        let list = self
98            .analyzer
99            .get_element(&self.book_source.rule_search.book_list, &res)?;
100
101        let res = list
102            .into_iter()
103            .flat_map(|item| {
104                self.book_source
105                    .rule_search
106                    .parse_to_book_list_item(&mut self.analyzer, &item)
107            })
108            .collect::<Vec<_>>();
109
110        Ok(res)
111    }
112
113    /// 使用explore_item的url获取书籍列表
114    pub async fn explore_books(
115        &mut self,
116        url: &str,
117        page: u32,
118        page_size: u32,
119    ) -> Result<BookList> {
120        if self.book_source.rule_explore.is_none() {
121            return Err(anyhow!("explore rule is none").into());
122        }
123        let url = self.analyzer.get_string(
124            url,
125            "",
126            Some(json!({
127                "page": page,
128                "page_size": page_size,
129            })),
130        )?;
131
132        let res = self.http_client.get(url.as_str()).await?.text().await?;
133
134        let list = self.analyzer.get_element(
135            &self.book_source.rule_explore.as_ref().unwrap().book_list,
136            &res,
137        )?;
138
139        let res = list
140            .into_iter()
141            .flat_map(|item| {
142                self.book_source
143                    .rule_explore
144                    .as_ref()
145                    .unwrap()
146                    .parse_to_book_list_item(&mut self.analyzer, &item)
147            })
148            .collect::<Vec<_>>();
149
150        Ok(res)
151    }
152
153    /// 获取书籍信息
154    pub async fn get_book_info(&mut self, book_url: &str) -> Result<BookInfo> {
155        let res = self.http_client.get(book_url).await?.text().await?;
156
157        let book_info = self
158            .book_source
159            .rule_book_info
160            .parse_to_book_info(&mut self.analyzer, &res);
161
162        self.temp = Some(res);
163
164        book_info
165    }
166
167    pub async fn get_chapters(&mut self, toc_url: &str) -> Result<Vec<Chapter>> {
168        // 如果toc_url是http开头的url,直接请求
169        let res = if toc_url.starts_with("/") || toc_url.starts_with("http") {
170            self.http_client.get(toc_url).await?.text().await?
171        } else {
172            self.temp.clone().ok_or(anyhow!("temp is none"))?
173        };
174
175        let list = self
176            .analyzer
177            .get_element(&self.book_source.rule_toc.chapter_list, &res)?;
178
179        let res = list
180            .into_iter()
181            .flat_map(|item| {
182                self.book_source
183                    .rule_toc
184                    .parse_to_chapter(&mut self.analyzer, &item)
185            })
186            .collect::<Vec<_>>();
187
188        Ok(res)
189    }
190
191    pub async fn get_content(&mut self, chapter_url: &str) -> Result<String> {
192        let mut res = self.http_client.get(chapter_url).await?.text().await?;
193
194        match &self.book_source.rule_content {
195            RuleContent::One { content } => self.analyzer.get_string(content, &res, None),
196
197            RuleContent::More {
198                content,
199                next_content_url,
200                start,
201                end,
202            } => {
203                let end = self
204                    .analyzer
205                    .get_string(end, &res, None)?
206                    .parse::<usize>()?;
207                let mut contents = vec![];
208                let mut start = *start;
209
210                loop {
211                    let content = self.analyzer.get_string(content, &res, None)?;
212                    contents.push(content);
213
214                    if start > end {
215                        break;
216                    }
217
218                    let next_url = self.analyzer.get_string(
219                        next_content_url,
220                        &res,
221                        Some(json!({
222                            "index": start,
223                        })),
224                    )?;
225                    res = self
226                        .http_client
227                        .get(next_url.as_str())
228                        .await?
229                        .text()
230                        .await?;
231                    start += 1;
232                }
233
234                Ok(contents.join("  "))
235            }
236        }
237    }
238}