1use bitflags::bitflags;
2use scraper::selectable::Selectable;
3use scraper::{Html, Selector};
4use std::io::{self};
5
6use super::helpers::*;
7
8pub use json_api::IssueJson;
9pub use json_api::PageJson;
10
/// Top-level domain suffix (including the leading dot) that
/// `ScraperOptions::default()` stores in its `tld` field.
pub const FALLBACK_TLD: &str = ".us";
12
/// User-selectable options for a scraping run.
///
/// The code that consumes these options is not part of this file, so the
/// purpose notes below are inferred from the field names and should be
/// confirmed against the callers. The stated defaults are those produced by
/// the `Default` implementation.
pub struct ScraperOptions {
    /// Defaults to `false`. Presumably whether downloaded page images are
    /// kept after the output files have been produced — TODO confirm.
    pub keep_images: bool,
    /// Selected [`FormatFlags`]; defaults to `FormatFlags::Pdf`.
    pub formats: FormatFlags,
    /// Defaults to `None`. Presumably the path of an archive file — TODO
    /// confirm what is recorded in it.
    pub archive_file: Option<String>,
    /// Defaults to `false`. Presumably skips the download step — TODO confirm.
    pub skip_download: bool,
    /// Defaults to `3`. Presumably the number of attempts made per download
    /// before giving up — TODO confirm.
    pub download_attempts: u32,
    /// Defaults to `false`. Presumably enables additional log output.
    pub verbose: bool,
    /// Top-level domain suffix; defaults to [`FALLBACK_TLD`].
    pub tld: String,
}
30
31impl Default for ScraperOptions {
32 fn default() -> Self {
33 Self {
34 keep_images: false,
35 formats: FormatFlags::Pdf,
36 archive_file: None,
37 skip_download: false,
38 download_attempts: 3,
39 verbose: false,
40 tld: FALLBACK_TLD.to_string(),
41 }
42 }
43}
44
bitflags! {
    /// Bit set of formats, stored in a `u32`.
    ///
    /// `Pdf` and `Cbz` each occupy one bit; `All` is their union.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct FormatFlags:u32 {
        /// No bits set. NOTE(review): a zero-valued flag is "contained" in
        /// every value, so prefer `is_empty()` over `contains(None)`.
        const None = 0b000;
        /// Bit 0.
        const Pdf = 0b001;
        /// Bit 1.
        const Cbz = 0b010;
        /// Both the `Pdf` and the `Cbz` bit.
        const All = 0b011;
    }
}
55
/// Metadata describing one item, as extracted from its HTML page by
/// `BookMetadata::from_page`.
///
/// String fields that could not be found on the page are left empty and
/// `length` is left at `0`.
#[derive(Debug, PartialEq, Eq)]
pub struct BookMetadata {
    /// Identifier passed to `from_page` by the caller.
    pub id: String,
    /// Text of `.booktitle`, overridden by a "Title" metadata row if present.
    pub title: String,
    /// First text node of the `#metadata` element.
    pub publish_date: String,
    /// Last `#metadata` text node (after the first) that is not a publisher,
    /// ISSN or page-count entry.
    pub volume: String,
    /// `#metadata` text that starts with "ISSN ", with that prefix removed.
    pub issn: String,
    /// `#metadata` text that starts with "Published by " (prefix removed),
    /// overridden by a "Publisher" metadata row if present.
    pub publisher: String,
    /// First text node of `#synopsistext`.
    pub description: String,
    /// Kind of publication, derived from the `#preview-link span` text.
    pub book_type: ContentType,
    /// Value of the "Author" metadata row.
    pub author: String,
    /// Page count, parsed from a "<n> pages" `#metadata` entry or from the
    /// "Length" metadata row.
    pub length: u32,
    /// Value of the "Digitized" metadata row.
    pub date_digitized: String,
    /// Value of the "Original from" metadata row.
    pub orig_from: String,
}
84
/// Serde data model for a JSON page listing. `IssueJson` and `PageJson` are
/// re-exported by the parent module.
///
/// NOTE(review): the endpoint that produces this JSON is not visible in this
/// file; field meanings below that go beyond the names are assumptions.
mod json_api {
    use serde::{Deserialize, Serialize};

    /// Top-level object: the list of pages found under the `page` key.
    #[derive(Serialize, Deserialize)]
    pub struct IssueJson {
        pub page: Vec<PageJson>,
    }

    /// One entry of the `page` list.
    #[derive(Serialize, Deserialize)]
    pub struct PageJson {
        /// Presumably the page identifier — TODO confirm.
        pub pid: String,
        /// Optional; presumably the image source URL of the page.
        pub src: Option<String>,
        /// Optional extra information attached to the page.
        pub additional_info: Option<PageAdditionalInfo>,
    }

    /// Wrapper object around the optional newspaper-specific page information.
    #[derive(Serialize, Deserialize)]
    pub struct PageAdditionalInfo {
        // The rename only applies when deserializing: the value is read from
        // the literal key "[NewspaperJSONPageInfo]" (brackets included), while
        // serialization uses the Rust field name.
        #[serde(rename(deserialize = "[NewspaperJSONPageInfo]"))]
        pub newspaper_json_page_info: Option<NewspaperJsonPageInfo>,
    }

    /// Tile resolutions and scan-job coordinates of a page.
    #[derive(Serialize, Deserialize)]
    pub struct NewspaperJsonPageInfo {
        // Read from the key "tileres" when deserializing.
        #[serde(rename(deserialize = "tileres"))]
        pub tile_res: Vec<TileRes>,
        pub page_scanjob_coordinates: Coordinates,
    }

    /// One tile-resolution entry; read from the short keys `h`, `w` and `z`
    /// when deserializing.
    #[derive(Serialize, Deserialize)]
    pub struct TileRes {
        #[serde(rename(deserialize = "h"))]
        pub height: u32,
        #[serde(rename(deserialize = "w"))]
        pub width: u32,
        #[serde(rename(deserialize = "z"))]
        pub zoom: u32,
    }

    /// Unsigned x/y coordinate pair.
    #[derive(Serialize, Deserialize)]
    pub struct Coordinates {
        pub x: u32,
        pub y: u32,
    }
}
134
/// Kind of publication a page describes.
///
/// `BookMetadata::from_page` picks `Magazine` or `Newspaper` when the
/// `#preview-link span` text contains the corresponding lowercase word, and
/// `Book` otherwise.
#[derive(Debug, PartialEq, Eq)]
pub enum ContentType {
    Book,
    Magazine,
    Newspaper,
}
141
/// Result of processing one item. NOTE(review): the code that produces this
/// value is not in this file; the variant meanings are inferred from names.
#[derive(Debug, PartialEq, Eq)]
// `Complete` carries a whole `BookMetadata` by value while `Skipped` carries
// nothing, which trips clippy's `large_enum_variant`; the lint is silenced
// here instead of boxing the payload.
#[allow(clippy::large_enum_variant)]
pub enum DownloadStatus {
    /// Nothing was produced; no metadata is attached.
    Skipped,
    /// Processing finished; carries the item's metadata.
    Complete(BookMetadata),
}
148
149impl BookMetadata {
150 const SUFFIX_PAGES: &'static str = " pages";
151 const PREFIX_PUBLISHER: &'static str = "Published by ";
152 const PREFIX_ISSN: &'static str = "ISSN ";
153
154 const LABEL_TITLE: &'static str = "Title";
155 const LABEL_AUTHOR: &'static str = "Author";
156 const LABEL_PUBLISHER: &'static str = "Publisher";
157 const LABEL_ORIG_FROM: &'static str = "Original from";
158 const LABEL_DIGITIZED: &'static str = "Digitized";
159 const LABEL_LENGTH: &'static str = "Length";
160 const LABEL_ISBN: &'static str = "ISBN";
161
162 pub fn get_title(&self) -> &str {
164 match self.book_type {
165 ContentType::Magazine | ContentType::Newspaper => &self.publish_date,
166 ContentType::Book => &self.title,
167 }
168 }
169
170 pub fn get_full_title(&self) -> String {
172 match self.book_type {
173 ContentType::Magazine | ContentType::Newspaper => {
174 std::format!("{} - {}", &self.title, &self.publish_date)
175 }
176 ContentType::Book => self.title.to_string(),
177 }
178 }
179
180 fn parse_length(text: &str) -> io::Result<u32> {
181 Self::remove_and_extract(text, Self::SUFFIX_PAGES)
182 .parse::<u32>()
183 .to_result()
184 }
185
186 fn remove_and_extract(source: &str, to_remove: &str) -> String {
187 source.replace(to_remove, "").trim().to_string()
188 }
189
190 pub fn from_page(id: &str, doc: &Html) -> io::Result<BookMetadata> {
192 let element = doc
193 .select(&Selector::parse("#summary_content_table").to_result()?)
194 .next()
195 .to_result("Metadata could not be parsed.")?;
196
197 let mut title = match element
198 .select(&Selector::parse(".booktitle").to_result()?)
199 .next()
200 .and_then(|e| e.text().next())
201 {
202 Some(x) => x.to_string(),
203 _ => String::new(),
204 };
205
206 let description = match element
207 .select(&Selector::parse("#synopsistext").to_result()?)
208 .next()
209 .and_then(|e| e.text().next())
210 {
211 Some(x) => x.to_string(),
212 _ => String::new(),
213 };
214
215 let mut publish_date = String::new();
216 let mut volume = String::new();
217 let mut issn = String::new();
218 let mut publisher = String::new();
219 let mut author = String::new();
220 let mut length = 0;
221 let mut date_digitized = String::new();
222 let mut orig_from = String::new();
223 let mut isbn = Vec::<String>::new();
224
225 if let Some(e) = element
227 .select(&Selector::parse("#metadata").to_result()?)
228 .next()
229 {
230 for (i, child) in e.text().enumerate() {
231 if i == 0 {
232 publish_date = child.to_string();
233 } else if child.starts_with(Self::PREFIX_PUBLISHER) {
234 publisher = Self::remove_and_extract(child, Self::PREFIX_PUBLISHER);
235 } else if child.starts_with(Self::PREFIX_ISSN) {
236 issn = Self::remove_and_extract(child, Self::PREFIX_ISSN);
237 } else if child.ends_with(Self::SUFFIX_PAGES) {
238 length = Self::parse_length(child)?;
239 } else {
240 volume = child.to_string();
241 }
242 }
243 };
244
245 for tr in doc.select(&Selector::parse(".metadata_row").to_result()?) {
247 if let Some(label) = tr
248 .select(&Selector::parse(".metadata_label").to_result()?)
249 .next()
250 .and_then(|e| e.text().next())
251 {
252 if let Some(value) = tr
253 .select(&Selector::parse(".metadata_value span").to_result()?)
254 .next()
255 .and_then(|e| e.text().next())
256 {
257 match label {
258 Self::LABEL_TITLE => {
259 title = value.to_string();
260 }
261 Self::LABEL_AUTHOR => {
262 author = value.to_string();
263 }
264 Self::LABEL_PUBLISHER => {
265 publisher = value.to_string();
266 }
267 Self::LABEL_ORIG_FROM => {
268 orig_from = value.to_string();
269 }
270 Self::LABEL_DIGITIZED => {
271 date_digitized = value.to_string();
272 }
273 Self::LABEL_ISBN => {
274 value
275 .split(",")
276 .for_each(|x| isbn.push(x.trim().to_string()));
277 }
278 Self::LABEL_LENGTH => {
279 length = Self::parse_length(value)?;
280 }
281 _ => (),
282 }
283 }
284 }
285 }
286
287 let book_type = match doc
289 .select(&Selector::parse("#preview-link span").to_result()?)
290 .next()
291 .and_then(|e| e.text().next())
292 {
293 Some(x) => {
294 if x.contains("magazine") {
295 ContentType::Magazine
296 } else if x.contains("newspaper") {
297 ContentType::Newspaper
298 } else {
299 ContentType::Book
300 }
301 }
302 _ => ContentType::Book,
303 };
304
305 Ok(BookMetadata {
306 id: id.to_string(),
307 title,
308 publish_date,
309 volume,
310 issn,
311 publisher,
312 description,
313 book_type,
314 author,
315 length,
316 date_digitized,
317 orig_from,
318 })
319 }
320}