gbscraper/scraper/
types.rs

1use bitflags::bitflags;
2use scraper::selectable::Selectable;
3use scraper::{Html, Selector};
4use std::collections::HashSet;
5use std::io::{self};
6
7use super::helpers::*;
8
9pub use json_api::IssueJson;
10pub use json_api::PageJson;
11
/// Scrape options.
///
/// The `Default` implementation selects PDF output only, skips nothing, uses no
/// archive file, downloads and processes images, allows three download attempts
/// per file and keeps output quiet.
pub struct ScraperOptions {
    /// If true, downloaded images will not be deleted after conversion.
    /// Defaults to `false`.
    pub keep_images: bool,
    /// Format(s) to convert downloaded images to. Defaults to `FormatFlags::Pdf`.
    pub formats: FormatFlags,
    /// IDs of issues to skip. Defaults to an empty set.
    pub already_downloaded: HashSet<String>,
    /// File to store IDs of already downloaded books. Defaults to `None`.
    pub archive_file: Option<String>,
    /// If true, only retrieve metadata without downloading or processing images.
    /// Defaults to `false`.
    pub skip_download: bool,
    /// Number of times to attempt to download any file before giving up on a book.
    /// Set to 0 to try indefinitely. Defaults to 3.
    pub download_attempts: u32,
    /// If true, extra output will be given. Defaults to `false`.
    pub verbose: bool,
}
29
30impl Default for ScraperOptions {
31    fn default() -> Self {
32        Self {
33            keep_images: false,
34            formats: FormatFlags::Pdf,
35            already_downloaded: HashSet::new(),
36            archive_file: None,
37            skip_download: false,
38            download_attempts: 3,
39            verbose: false,
40        }
41    }
42}
43
bitflags! {
    /// Format(s) to convert downloaded images to.
    ///
    /// Flags can be combined; `All` is the union of `Pdf` and `Cbz`.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct FormatFlags:u32 {
        // No format selected (no bits set).
        // NOTE(review): being zero-valued, this flag is presumably "contained" in
        // every value; prefer an emptiness check over `contains` - confirm against
        // the bitflags documentation.
        const None = 0b000;
        // Bit 0: PDF output.
        const Pdf =  0b001;
        // Bit 1: CBZ output.
        const Cbz =  0b010;
        // Both bits set: every supported format (Pdf | Cbz).
        const All =  0b011;
    }
}
54
/// Metadata for book or individual issue of magazine.
///
/// When built by `BookMetadata::from_page`, any textual field that is not found
/// on the page is left as an empty string, and `length` is left as 0.
#[derive(Debug, PartialEq, Eq)]
pub struct BookMetadata {
    /// ID used to identify book resource (copied from the `id` passed to `from_page`)
    pub id: String,
    /// Title of book or periodical; taken from the `.booktitle` element and
    /// overridden by a "Title" bibliography row when one is present
    pub title: String,
    /// Date issue was published (the first text node of the `#metadata` element)
    pub publish_date: String,
    /// Volume of issue (a `#metadata` text node that is not recognised as
    /// publisher, ISSN or page count)
    pub volume: String,
    /// ISSN of publication, without the "ISSN " prefix
    pub issn: String,
    /// Publisher, without the "Published by " prefix
    pub publisher: String,
    /// Description of publication (first text node of `#synopsistext`)
    pub description: String,
    /// Type of book, derived from the text of the preview link
    pub book_type: ContentType,
    /// Author of the book (from the "Author" bibliography row)
    pub author: String,
    /// Number of pages
    pub length: u32,
    /// Date book was digitized (from the "Digitized" bibliography row)
    pub date_digitized: String,
    /// Source of book (from the "Original from" bibliography row)
    pub orig_from: String,
}
83
/// Data types deserializing JSON API calls to get book info.
///
/// Only `IssueJson` and `PageJson` are re-exported by the parent module.
mod json_api {
    use serde::{Deserialize, Serialize};

    /// Result of API call to get metadata about a book or issue.
    #[derive(Serialize, Deserialize)]
    pub struct IssueJson {
        /// Pages listed in the response (JSON key `page`).
        pub page: Vec<PageJson>,
    }

    /// Metadata pertaining to specific page.
    #[derive(Serialize, Deserialize)]
    pub struct PageJson {
        /// Page identifier as given by the API (JSON key `pid`).
        pub pid: String,
        /// Optional `src` value for the page.
        // NOTE(review): presumably the image URL of the page - confirm against callers.
        pub src: Option<String>,
        /// Optional extra information attached to the page.
        pub additional_info: Option<PageAdditionalInfo>,
    }

    /// Additional metadata for specific page.
    #[derive(Serialize, Deserialize)]
    pub struct PageAdditionalInfo {
        /// Newspaper-specific details, read from the JSON key
        /// `[NewspaperJSONPageInfo]` (brackets included). The rename applies to
        /// deserialization only; serialization uses the field name.
        #[serde(rename(deserialize = "[NewspaperJSONPageInfo]"))]
        pub newspaper_json_page_info: Option<NewspaperJsonPageInfo>,
    }

    /// Additional metadata for newspaper pages.
    #[derive(Serialize, Deserialize)]
    pub struct NewspaperJsonPageInfo {
        /// Entries read from the JSON key `tileres` (deserialization-only rename).
        #[serde(rename(deserialize = "tileres"))]
        pub tile_res: Vec<TileRes>,
        /// Coordinates read from the JSON key `page_scanjob_coordinates`.
        pub page_scanjob_coordinates: Coordinates,
    }

    /// One entry of the `tileres` list: a height/width pair tied to a zoom value.
    // NOTE(review): units are not visible here (presumably pixels) - confirm.
    #[derive(Serialize, Deserialize)]
    pub struct TileRes {
        /// Read from the JSON key `h`.
        #[serde(rename(deserialize = "h"))]
        pub height: u32,
        /// Read from the JSON key `w`.
        #[serde(rename(deserialize = "w"))]
        pub width: u32,
        /// Read from the JSON key `z`.
        #[serde(rename(deserialize = "z"))]
        pub zoom: u32,
    }

    /// An unsigned x/y coordinate pair.
    #[derive(Serialize, Deserialize)]
    pub struct Coordinates {
        /// Horizontal component (JSON key `x`).
        pub x: u32,
        /// Vertical component (JSON key `y`).
        pub y: u32,
    }
}
133
/// Kind of publication a `BookMetadata` describes.
///
/// Magazines and newspapers are titled by publish date in
/// `BookMetadata::get_title`; books are titled by their own title.
#[derive(Debug, PartialEq, Eq)]
pub enum ContentType {
    /// A standalone book; also the fallback when the page gives no other hint.
    Book,
    /// An issue of a magazine.
    Magazine,
    /// An issue of a newspaper.
    Newspaper,
}
140
/// Outcome of processing a single book or issue.
#[derive(Debug, PartialEq, Eq)]
// `Complete` stores a whole `BookMetadata` inline while `Skipped` stores nothing,
// which is presumably what the lint flags; it is silenced here rather than
// boxing the payload.
#[allow(clippy::large_enum_variant)]
pub enum DownloadStatus {
    /// The item was not processed.
    // NOTE(review): presumably because its ID was already downloaded - confirm at
    // the call site.
    Skipped,
    /// The item was processed; carries the metadata that was extracted for it.
    Complete(BookMetadata),
}
147
148impl BookMetadata {
149    const SUFFIX_PAGES: &'static str = " pages";
150    const PREFIX_PUBLISHER: &'static str = "Published by ";
151    const PREFIX_ISSN: &'static str = "ISSN ";
152
153    const LABEL_TITLE: &'static str = "Title";
154    const LABEL_AUTHOR: &'static str = "Author";
155    const LABEL_PUBLISHER: &'static str = "Publisher";
156    const LABEL_ORIG_FROM: &'static str = "Original from";
157    const LABEL_DIGITIZED: &'static str = "Digitized";
158    const LABEL_LENGTH: &'static str = "Length";
159    const LABEL_ISBN: &'static str = "ISBN";
160
161    /// Gets the shortest title identifying this book.
162    pub fn get_title(&self) -> &str {
163        match self.book_type {
164            ContentType::Magazine | ContentType::Newspaper => &self.publish_date,
165            ContentType::Book => &self.title,
166        }
167    }
168
169    /// Gets the full title of this book, including the series name if it is a magazine issue.
170    pub fn get_full_title(&self) -> String {
171        match self.book_type {
172            ContentType::Magazine | ContentType::Newspaper => {
173                std::format!("{} - {}", &self.title, &self.publish_date)
174            }
175            ContentType::Book => self.title.to_string(),
176        }
177    }
178
179    fn parse_length(text: &str) -> io::Result<u32> {
180        Self::remove_and_extract(text, Self::SUFFIX_PAGES)
181            .parse::<u32>()
182            .to_result()
183    }
184
185    fn remove_and_extract(source: &str, to_remove: &str) -> String {
186        source.replace(to_remove, "").trim().to_string()
187    }
188
189    /// Extracts metadata from webpage.
190    pub fn from_page(id: &str, doc: &Html) -> io::Result<BookMetadata> {
191        let element = doc
192            .select(&Selector::parse("#summary_content_table").to_result()?)
193            .next()
194            .to_result("Metadata could not be parsed.")?;
195
196        let mut title = match element
197            .select(&Selector::parse(".booktitle").to_result()?)
198            .next()
199            .and_then(|e| e.text().next())
200        {
201            Some(x) => x.to_string(),
202            _ => String::new(),
203        };
204
205        let description = match element
206            .select(&Selector::parse("#synopsistext").to_result()?)
207            .next()
208            .and_then(|e| e.text().next())
209        {
210            Some(x) => x.to_string(),
211            _ => String::new(),
212        };
213
214        let mut publish_date = String::new();
215        let mut volume = String::new();
216        let mut issn = String::new();
217        let mut publisher = String::new();
218        let mut author = String::new();
219        let mut length = 0;
220        let mut date_digitized = String::new();
221        let mut orig_from = String::new();
222        let mut isbn = Vec::<String>::new();
223
224        // Main metadata area
225        if let Some(e) = element
226            .select(&Selector::parse("#metadata").to_result()?)
227            .next()
228        {
229            for (i, child) in e.text().enumerate() {
230                if i == 0 {
231                    publish_date = child.to_string();
232                } else if child.starts_with(Self::PREFIX_PUBLISHER) {
233                    publisher = Self::remove_and_extract(child, Self::PREFIX_PUBLISHER);
234                } else if child.starts_with(Self::PREFIX_ISSN) {
235                    issn = Self::remove_and_extract(child, Self::PREFIX_ISSN);
236                } else if child.ends_with(Self::SUFFIX_PAGES) {
237                    length = Self::parse_length(child)?;
238                } else {
239                    volume = child.to_string();
240                }
241            }
242        };
243
244        // Bibliography area - used specifically by books?
245        for tr in doc.select(&Selector::parse(".metadata_row").to_result()?) {
246            if let Some(label) = tr
247                .select(&Selector::parse(".metadata_label").to_result()?)
248                .next()
249                .and_then(|e| e.text().next())
250            {
251                if let Some(value) = tr
252                    .select(&Selector::parse(".metadata_value span").to_result()?)
253                    .next()
254                    .and_then(|e| e.text().next())
255                {
256                    match label {
257                        Self::LABEL_TITLE => {
258                            title = value.to_string();
259                        }
260                        Self::LABEL_AUTHOR => {
261                            author = value.to_string();
262                        }
263                        Self::LABEL_PUBLISHER => {
264                            publisher = value.to_string();
265                        }
266                        Self::LABEL_ORIG_FROM => {
267                            orig_from = value.to_string();
268                        }
269                        Self::LABEL_DIGITIZED => {
270                            date_digitized = value.to_string();
271                        }
272                        Self::LABEL_ISBN => {
273                            value
274                                .split(",")
275                                .for_each(|x| isbn.push(x.trim().to_string()));
276                        }
277                        Self::LABEL_LENGTH => {
278                            length = Self::parse_length(value)?;
279                        }
280                        _ => (),
281                    }
282                }
283            }
284        }
285
286        // Determine content type from text in preview link
287        let book_type = match doc
288            .select(&Selector::parse("#preview-link span").to_result()?)
289            .next()
290            .and_then(|e| e.text().next())
291        {
292            Some(x) => {
293                if x.contains("magazine") {
294                    ContentType::Magazine
295                } else if x.contains("newspaper") {
296                    ContentType::Newspaper
297                } else {
298                    ContentType::Book
299                }
300            }
301            _ => ContentType::Book,
302        };
303
304        Ok(BookMetadata {
305            id: id.to_string(),
306            title,
307            publish_date,
308            volume,
309            issn,
310            publisher,
311            description,
312            book_type,
313            author,
314            length,
315            date_digitized,
316            orig_from,
317        })
318    }
319}
gbscraper/scraper/types.rs

gbscraper/scraper/
types.rs