1use bitflags::bitflags;
2use scraper::selectable::Selectable;
3use scraper::{Html, Selector};
4use std::collections::HashSet;
5use std::io::{self};
6
7use super::helpers::*;
8
9pub use json_api::IssueJson;
10pub use json_api::PageJson;
11
12pub struct ScraperOptions {
14 pub keep_images: bool,
16 pub formats: FormatFlags,
18 pub already_downloaded: HashSet<String>,
20 pub archive_file: Option<String>,
22 pub skip_download: bool,
24 pub download_attempts: u32,
26 pub verbose: bool,
28}
29
30impl Default for ScraperOptions {
31 fn default() -> Self {
32 Self {
33 keep_images: false,
34 formats: FormatFlags::Pdf,
35 already_downloaded: HashSet::new(),
36 archive_file: None,
37 skip_download: false,
38 download_attempts: 3,
39 verbose: false,
40 }
41 }
42}
43
44bitflags! {
45 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
47 pub struct FormatFlags:u32 {
48 const None = 0b000;
49 const Pdf = 0b001;
50 const Cbz = 0b010;
51 const All = 0b011;
52 }
53}
54
55#[derive(Debug, PartialEq, Eq)]
57pub struct BookMetadata {
58 pub id: String,
60 pub title: String,
62 pub publish_date: String,
64 pub volume: String,
66 pub issn: String,
68 pub publisher: String,
70 pub description: String,
72 pub book_type: ContentType,
74 pub author: String,
76 pub length: u32,
78 pub date_digitized: String,
80 pub orig_from: String,
82}
83
84mod json_api {
86 use serde::{Deserialize, Serialize};
87
88 #[derive(Serialize, Deserialize)]
90 pub struct IssueJson {
91 pub page: Vec<PageJson>,
92 }
93
94 #[derive(Serialize, Deserialize)]
96 pub struct PageJson {
97 pub pid: String,
98 pub src: Option<String>,
99 pub additional_info: Option<PageAdditionalInfo>,
100 }
101
102 #[derive(Serialize, Deserialize)]
104 pub struct PageAdditionalInfo {
105 #[serde(rename(deserialize = "[NewspaperJSONPageInfo]"))]
106 pub newspaper_json_page_info: Option<NewspaperJsonPageInfo>,
107 }
108
109 #[derive(Serialize, Deserialize)]
111 pub struct NewspaperJsonPageInfo {
112 #[serde(rename(deserialize = "tileres"))]
113 pub tile_res: Vec<TileRes>,
114 pub page_scanjob_coordinates: Coordinates,
115 }
116
117 #[derive(Serialize, Deserialize)]
118 pub struct TileRes {
119 #[serde(rename(deserialize = "h"))]
120 pub height: u32,
121 #[serde(rename(deserialize = "w"))]
122 pub width: u32,
123 #[serde(rename(deserialize = "z"))]
124 pub zoom: u32,
125 }
126
127 #[derive(Serialize, Deserialize)]
128 pub struct Coordinates {
129 pub x: u32,
130 pub y: u32,
131 }
132}
133
/// Kind of content an item represents.
///
/// The type is a fieldless enum, so it is `Copy` and can be passed by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ContentType {
    /// A book.
    Book,
    /// A magazine.
    Magazine,
    /// A newspaper.
    Newspaper,
}
140
141#[derive(Debug, PartialEq, Eq)]
142#[allow(clippy::large_enum_variant)]
143pub enum DownloadStatus {
144 Skipped,
145 Complete(BookMetadata),
146}
147
148impl BookMetadata {
149 const SUFFIX_PAGES: &'static str = " pages";
150 const PREFIX_PUBLISHER: &'static str = "Published by ";
151 const PREFIX_ISSN: &'static str = "ISSN ";
152
153 const LABEL_TITLE: &'static str = "Title";
154 const LABEL_AUTHOR: &'static str = "Author";
155 const LABEL_PUBLISHER: &'static str = "Publisher";
156 const LABEL_ORIG_FROM: &'static str = "Original from";
157 const LABEL_DIGITIZED: &'static str = "Digitized";
158 const LABEL_LENGTH: &'static str = "Length";
159 const LABEL_ISBN: &'static str = "ISBN";
160
161 pub fn get_title(&self) -> &str {
163 match self.book_type {
164 ContentType::Magazine | ContentType::Newspaper => &self.publish_date,
165 ContentType::Book => &self.title,
166 }
167 }
168
169 pub fn get_full_title(&self) -> String {
171 match self.book_type {
172 ContentType::Magazine | ContentType::Newspaper => {
173 std::format!("{} - {}", &self.title, &self.publish_date)
174 }
175 ContentType::Book => self.title.to_string(),
176 }
177 }
178
179 fn parse_length(text: &str) -> io::Result<u32> {
180 Self::remove_and_extract(text, Self::SUFFIX_PAGES)
181 .parse::<u32>()
182 .to_result()
183 }
184
185 fn remove_and_extract(source: &str, to_remove: &str) -> String {
186 source.replace(to_remove, "").trim().to_string()
187 }
188
189 pub fn from_page(id: &str, doc: &Html) -> io::Result<BookMetadata> {
191 let element = doc
192 .select(&Selector::parse("#summary_content_table").to_result()?)
193 .next()
194 .to_result("Metadata could not be parsed.")?;
195
196 let mut title = match element
197 .select(&Selector::parse(".booktitle").to_result()?)
198 .next()
199 .and_then(|e| e.text().next())
200 {
201 Some(x) => x.to_string(),
202 _ => String::new(),
203 };
204
205 let description = match element
206 .select(&Selector::parse("#synopsistext").to_result()?)
207 .next()
208 .and_then(|e| e.text().next())
209 {
210 Some(x) => x.to_string(),
211 _ => String::new(),
212 };
213
214 let mut publish_date = String::new();
215 let mut volume = String::new();
216 let mut issn = String::new();
217 let mut publisher = String::new();
218 let mut author = String::new();
219 let mut length = 0;
220 let mut date_digitized = String::new();
221 let mut orig_from = String::new();
222 let mut isbn = Vec::<String>::new();
223
224 if let Some(e) = element
226 .select(&Selector::parse("#metadata").to_result()?)
227 .next()
228 {
229 for (i, child) in e.text().enumerate() {
230 if i == 0 {
231 publish_date = child.to_string();
232 } else if child.starts_with(Self::PREFIX_PUBLISHER) {
233 publisher = Self::remove_and_extract(child, Self::PREFIX_PUBLISHER);
234 } else if child.starts_with(Self::PREFIX_ISSN) {
235 issn = Self::remove_and_extract(child, Self::PREFIX_ISSN);
236 } else if child.ends_with(Self::SUFFIX_PAGES) {
237 length = Self::parse_length(child)?;
238 } else {
239 volume = child.to_string();
240 }
241 }
242 };
243
244 for tr in doc.select(&Selector::parse(".metadata_row").to_result()?) {
246 if let Some(label) = tr
247 .select(&Selector::parse(".metadata_label").to_result()?)
248 .next()
249 .and_then(|e| e.text().next())
250 {
251 if let Some(value) = tr
252 .select(&Selector::parse(".metadata_value span").to_result()?)
253 .next()
254 .and_then(|e| e.text().next())
255 {
256 match label {
257 Self::LABEL_TITLE => {
258 title = value.to_string();
259 }
260 Self::LABEL_AUTHOR => {
261 author = value.to_string();
262 }
263 Self::LABEL_PUBLISHER => {
264 publisher = value.to_string();
265 }
266 Self::LABEL_ORIG_FROM => {
267 orig_from = value.to_string();
268 }
269 Self::LABEL_DIGITIZED => {
270 date_digitized = value.to_string();
271 }
272 Self::LABEL_ISBN => {
273 value
274 .split(",")
275 .for_each(|x| isbn.push(x.trim().to_string()));
276 }
277 Self::LABEL_LENGTH => {
278 length = Self::parse_length(value)?;
279 }
280 _ => (),
281 }
282 }
283 }
284 }
285
286 let book_type = match doc
288 .select(&Selector::parse("#preview-link span").to_result()?)
289 .next()
290 .and_then(|e| e.text().next())
291 {
292 Some(x) => {
293 if x.contains("magazine") {
294 ContentType::Magazine
295 } else if x.contains("newspaper") {
296 ContentType::Newspaper
297 } else {
298 ContentType::Book
299 }
300 }
301 _ => ContentType::Book,
302 };
303
304 Ok(BookMetadata {
305 id: id.to_string(),
306 title,
307 publish_date,
308 volume,
309 issn,
310 publisher,
311 description,
312 book_type,
313 author,
314 length,
315 date_digitized,
316 orig_from,
317 })
318 }
319}