1use bitflags::bitflags;
2use scraper::selectable::Selectable;
3use scraper::{Html, Selector};
4use std::collections::HashSet;
5use std::io::{self};
6
7use super::helpers::*;
8
9pub use json_api::IssueJson;
10pub use json_api::PageJson;
11
12pub struct ScraperOptions {
14 pub keep_images: bool,
16 pub formats: FormatFlags,
18 pub already_downloaded: HashSet<String>,
20 pub archive_file: Option<String>,
22 pub skip_download: bool,
24 pub download_attempts: u32,
26 pub verbose: bool,
28}
29
30impl Default for ScraperOptions {
31 fn default() -> Self {
32 Self {
33 keep_images: false,
34 formats: FormatFlags::Pdf,
35 already_downloaded: HashSet::new(),
36 archive_file: None,
37 skip_download: false,
38 download_attempts: 3,
39 verbose: false,
40 }
41 }
42}
43
44bitflags! {
45 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
47 pub struct FormatFlags:u32 {
48 const None = 0b000;
49 const Pdf = 0b001;
50 const Cbz = 0b010;
51 const All = 0b011;
52 }
53}
54
55#[derive(Debug, PartialEq, Eq)]
57pub struct BookMetadata {
58 pub id: String,
60 pub title: String,
62 pub publish_date: String,
64 pub volume: String,
66 pub issn: String,
68 pub publisher: String,
70 pub description: String,
72 pub book_type: ContentType,
74 pub author: String,
76 pub length: u32,
78 pub date_digitized: String,
80 pub orig_from: String,
82}
83
84mod json_api {
86 use serde::{Deserialize, Serialize};
87
88 #[derive(Serialize, Deserialize)]
90 pub struct IssueJson {
91 pub page: Vec<PageJson>,
92 }
93
94 #[derive(Serialize, Deserialize)]
96 pub struct PageJson {
97 pub pid: String,
98 pub src: Option<String>,
99 pub additional_info: Option<PageAdditionalInfo>,
100 }
101
102 #[derive(Serialize, Deserialize)]
104 pub struct PageAdditionalInfo {
105 #[serde(rename(deserialize = "[NewspaperJSONPageInfo]"))]
106 pub newspaper_json_page_info: Option<NewspaperJsonPageInfo>,
107 }
108
109 #[derive(Serialize, Deserialize)]
111 pub struct NewspaperJsonPageInfo {
112 #[serde(rename(deserialize = "tileres"))]
113 pub tile_res: Vec<TileRes>,
114 pub page_scanjob_coordinates: Coordinates,
115 }
116
117 #[derive(Serialize, Deserialize)]
118 pub struct TileRes {
119 #[serde(rename(deserialize = "h"))]
120 pub height: u32,
121 #[serde(rename(deserialize = "w"))]
122 pub width: u32,
123 #[serde(rename(deserialize = "z"))]
124 pub zoom: u32,
125 }
126
127 #[derive(Serialize, Deserialize)]
128 pub struct Coordinates {
129 pub x: u32,
130 pub y: u32,
131 }
132}
133
/// Kind of publication an item represents.
///
/// `BookMetadata::from_page` picks `Magazine` or `Newspaper` when the preview
/// link text contains that word, and `Book` otherwise.
#[derive(Debug, PartialEq, Eq)]
pub enum ContentType {
    /// A regular book (also the fallback when no other kind is detected).
    Book,
    /// A magazine issue.
    Magazine,
    /// A newspaper issue.
    Newspaper,
}
140
141#[derive(Debug, PartialEq, Eq)]
142pub enum DownloadStatus {
143 Skipped,
144 Complete(BookMetadata),
145}
146
147impl BookMetadata {
148 const SUFFIX_PAGES: &'static str = " pages";
149 const PREFIX_PUBLISHER: &'static str = "Published by ";
150 const PREFIX_ISSN: &'static str = "ISSN ";
151
152 const LABEL_TITLE: &'static str = "Title";
153 const LABEL_AUTHOR: &'static str = "Author";
154 const LABEL_PUBLISHER: &'static str = "Publisher";
155 const LABEL_ORIG_FROM: &'static str = "Original from";
156 const LABEL_DIGITIZED: &'static str = "Digitized";
157 const LABEL_LENGTH: &'static str = "Length";
158 const LABEL_ISBN: &'static str = "ISBN";
159
160 pub fn get_title(&self) -> &str {
162 match self.book_type {
163 ContentType::Magazine | ContentType::Newspaper => &self.publish_date,
164 ContentType::Book => &self.title,
165 }
166 }
167
168 pub fn get_full_title(&self) -> String {
170 match self.book_type {
171 ContentType::Magazine | ContentType::Newspaper => {
172 std::format!("{} - {}", &self.title, &self.publish_date)
173 }
174 ContentType::Book => self.title.to_string(),
175 }
176 }
177
178 fn parse_length(text: &str) -> io::Result<u32> {
179 Ok(Self::remove_and_extract(text, Self::SUFFIX_PAGES)
180 .parse::<u32>()
181 .to_result()?)
182 }
183
184 fn remove_and_extract(source: &str, to_remove: &str) -> String {
185 source.replace(to_remove, "").trim().to_string()
186 }
187
188 pub fn from_page(id: &str, doc: &Html) -> io::Result<BookMetadata> {
190 let element = doc
191 .select(&Selector::parse("#summary_content_table").to_result()?)
192 .next()
193 .to_result("Metadata could not be parsed.")?;
194
195 let mut title = match element
196 .select(&Selector::parse(".booktitle").to_result()?)
197 .next()
198 .and_then(|e| e.text().next())
199 {
200 Some(x) => x.to_string(),
201 _ => String::new(),
202 };
203
204 let description = match element
205 .select(&Selector::parse("#synopsistext").to_result()?)
206 .next()
207 .and_then(|e| e.text().next())
208 {
209 Some(x) => x.to_string(),
210 _ => String::new(),
211 };
212
213 let mut publish_date = String::new();
214 let mut volume = String::new();
215 let mut issn = String::new();
216 let mut publisher = String::new();
217 let mut author = String::new();
218 let mut length = 0;
219 let mut date_digitized = String::new();
220 let mut orig_from = String::new();
221 let mut isbn = Vec::<String>::new();
222
223 if let Some(e) = element
225 .select(&Selector::parse("#metadata").to_result()?)
226 .next()
227 {
228 let mut i: u32 = 0;
229 for child in e.text() {
230 if i == 0 {
231 publish_date = child.to_string();
232 } else if child.starts_with(Self::PREFIX_PUBLISHER) {
233 publisher = Self::remove_and_extract(child, Self::PREFIX_PUBLISHER);
234 } else if child.starts_with(Self::PREFIX_ISSN) {
235 issn = Self::remove_and_extract(child, Self::PREFIX_ISSN);
236 } else if child.ends_with(Self::SUFFIX_PAGES) {
237 length = Self::parse_length(child)?;
238 } else {
239 volume = child.to_string();
240 }
241
242 i += 1;
243 }
244 };
245
246 for tr in doc.select(&Selector::parse(".metadata_row").to_result()?) {
248 if let Some(label) = tr
249 .select(&Selector::parse(".metadata_label").to_result()?)
250 .next()
251 .and_then(|e| e.text().next())
252 {
253 if let Some(value) = tr
254 .select(&Selector::parse(".metadata_value span").to_result()?)
255 .next()
256 .and_then(|e| e.text().next())
257 {
258 match label {
259 Self::LABEL_TITLE => {
260 title = value.to_string();
261 }
262 Self::LABEL_AUTHOR => {
263 author = value.to_string();
264 }
265 Self::LABEL_PUBLISHER => {
266 publisher = value.to_string();
267 }
268 Self::LABEL_ORIG_FROM => {
269 orig_from = value.to_string();
270 }
271 Self::LABEL_DIGITIZED => {
272 date_digitized = value.to_string();
273 }
274 Self::LABEL_ISBN => {
275 value
276 .split(",")
277 .for_each(|x| isbn.push(x.trim().to_string()));
278 }
279 Self::LABEL_LENGTH => {
280 length = Self::parse_length(value)?;
281 }
282 _ => (),
283 }
284 }
285 }
286 }
287
288 let book_type = match doc
290 .select(&Selector::parse("#preview-link span").to_result()?)
291 .next()
292 .and_then(|e| e.text().next())
293 {
294 Some(x) => {
295 if x.contains("magazine") {
296 ContentType::Magazine
297 } else if x.contains("newspaper") {
298 ContentType::Newspaper
299 } else {
300 ContentType::Book
301 }
302 }
303 _ => ContentType::Book,
304 };
305
306 Ok(BookMetadata {
307 id: id.to_string(),
308 title,
309 publish_date,
310 volume,
311 issn,
312 publisher,
313 description,
314 book_type,
315 author,
316 length,
317 date_digitized,
318 orig_from,
319 })
320 }
321}