Skip to main content

gbscraper/scraper/
types.rs

1use bitflags::bitflags;
2use scraper::selectable::Selectable;
3use scraper::{Html, Selector};
4use std::io::{self};
5
6use super::helpers::*;
7
8pub use json_api::IssueJson;
9pub use json_api::PageJson;
10
/// Top-level domain that [`ScraperOptions::default`] assigns to its `tld` field.
pub const FALLBACK_TLD: &str = ".us";
12
13/// Scrape options.
14pub struct ScraperOptions {
15    /// If true, downloaded images will not be deleted after conversion.
16    pub keep_images: bool,
17    /// Format(s) to convert downloaded images to.
18    pub formats: FormatFlags,
19    /// File to store IDs of already downloaded books.
20    pub archive_file: Option<String>,
21    /// If true, only retrieve metadata without downloading or processing images.
22    pub skip_download: bool,
23    /// Number of times to attempt to download any file before giving up on a book. Set to 0 to try indefinitely.
24    pub download_attempts: u32,
25    /// If true, extra output will be given.
26    pub verbose: bool,
27    /// Top level domain to use for URLs.
28    pub tld: String,
29}
30
31impl Default for ScraperOptions {
32    fn default() -> Self {
33        Self {
34            keep_images: false,
35            formats: FormatFlags::Pdf,
36            archive_file: None,
37            skip_download: false,
38            download_attempts: 3,
39            verbose: false,
40            tld: FALLBACK_TLD.to_string(),
41        }
42    }
43}
44
45bitflags! {
46    /// Format(s) downloaded images to
47    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
48    pub struct FormatFlags:u32 {
49        const None = 0b000;
50        const Pdf =  0b001;
51        const Cbz =  0b010;
52        const All =  0b011;
53    }
54}
55
/// Metadata for book or individual issue of magazine.
///
/// Values are populated by `BookMetadata::from_page`; text fields that are not found on the
/// page are left as empty strings and `length` is left at 0.
#[derive(Debug, PartialEq, Eq)]
pub struct BookMetadata {
    /// ID used to identify book resource
    pub id: String,
    /// Title of book or periodical
    pub title: String,
    /// Date issue was published
    pub publish_date: String,
    /// Volume of issue
    pub volume: String,
    /// ISSN of publication
    pub issn: String,
    /// Publisher
    pub publisher: String,
    /// Description of publication
    pub description: String,
    /// Type of content (book, magazine or newspaper)
    pub book_type: ContentType,
    /// Author of the book
    pub author: String,
    /// Number of pages
    pub length: u32,
    /// Date book was digitized
    pub date_digitized: String,
    /// Source of book
    pub orig_from: String,
}
84
85/// Data types deserializing JSON API calls to get book info.
86mod json_api {
87    use serde::{Deserialize, Serialize};
88
89    /// Result of API call to get metadata about a book or issue.
90    #[derive(Serialize, Deserialize)]
91    pub struct IssueJson {
92        pub page: Vec<PageJson>,
93    }
94
95    /// Metadata pertaining to specific page.
96    #[derive(Serialize, Deserialize)]
97    pub struct PageJson {
98        pub pid: String,
99        pub src: Option<String>,
100        pub additional_info: Option<PageAdditionalInfo>,
101    }
102
103    /// Additional metadata for specific page.
104    #[derive(Serialize, Deserialize)]
105    pub struct PageAdditionalInfo {
106        #[serde(rename(deserialize = "[NewspaperJSONPageInfo]"))]
107        pub newspaper_json_page_info: Option<NewspaperJsonPageInfo>,
108    }
109
110    /// Additional metadata for newspaper pages.
111    #[derive(Serialize, Deserialize)]
112    pub struct NewspaperJsonPageInfo {
113        #[serde(rename(deserialize = "tileres"))]
114        pub tile_res: Vec<TileRes>,
115        pub page_scanjob_coordinates: Coordinates,
116    }
117
118    #[derive(Serialize, Deserialize)]
119    pub struct TileRes {
120        #[serde(rename(deserialize = "h"))]
121        pub height: u32,
122        #[serde(rename(deserialize = "w"))]
123        pub width: u32,
124        #[serde(rename(deserialize = "z"))]
125        pub zoom: u32,
126    }
127
128    #[derive(Serialize, Deserialize)]
129    pub struct Coordinates {
130        pub x: u32,
131        pub y: u32,
132    }
133}
134
/// Kind of publication a piece of metadata describes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ContentType {
    /// A standalone book.
    Book,
    /// An issue of a magazine.
    Magazine,
    /// An issue of a newspaper.
    Newspaper,
}
141
/// Outcome of processing a single book.
#[derive(Debug, PartialEq, Eq)]
// `Skipped` carries no data while `Complete` holds a whole `BookMetadata` inline, which is what
// the lint reports. NOTE(review): presumably left unboxed to keep the variant simple to
// construct and match on — confirm before boxing.
#[allow(clippy::large_enum_variant)]
pub enum DownloadStatus {
    /// The book was not processed. NOTE(review): presumably because it was already recorded
    /// as downloaded — confirm against the caller.
    Skipped,
    /// Processing finished; carries the metadata of the book.
    Complete(BookMetadata),
}
148
149impl BookMetadata {
150    const SUFFIX_PAGES: &'static str = " pages";
151    const PREFIX_PUBLISHER: &'static str = "Published by ";
152    const PREFIX_ISSN: &'static str = "ISSN ";
153
154    const LABEL_TITLE: &'static str = "Title";
155    const LABEL_AUTHOR: &'static str = "Author";
156    const LABEL_PUBLISHER: &'static str = "Publisher";
157    const LABEL_ORIG_FROM: &'static str = "Original from";
158    const LABEL_DIGITIZED: &'static str = "Digitized";
159    const LABEL_LENGTH: &'static str = "Length";
160    const LABEL_ISBN: &'static str = "ISBN";
161
162    /// Gets the shortest title identifying this book.
163    pub fn get_title(&self) -> &str {
164        match self.book_type {
165            ContentType::Magazine | ContentType::Newspaper => &self.publish_date,
166            ContentType::Book => &self.title,
167        }
168    }
169
170    /// Gets the full title of this book, including the series name if it is a magazine issue.
171    pub fn get_full_title(&self) -> String {
172        match self.book_type {
173            ContentType::Magazine | ContentType::Newspaper => {
174                std::format!("{} - {}", &self.title, &self.publish_date)
175            }
176            ContentType::Book => self.title.to_string(),
177        }
178    }
179
180    fn parse_length(text: &str) -> io::Result<u32> {
181        Self::remove_and_extract(text, Self::SUFFIX_PAGES)
182            .parse::<u32>()
183            .to_result()
184    }
185
186    fn remove_and_extract(source: &str, to_remove: &str) -> String {
187        source.replace(to_remove, "").trim().to_string()
188    }
189
190    /// Extracts metadata from webpage.
191    pub fn from_page(id: &str, doc: &Html) -> io::Result<BookMetadata> {
192        let element = doc
193            .select(&Selector::parse("#summary_content_table").to_result()?)
194            .next()
195            .to_result("Metadata could not be parsed.")?;
196
197        let mut title = match element
198            .select(&Selector::parse(".booktitle").to_result()?)
199            .next()
200            .and_then(|e| e.text().next())
201        {
202            Some(x) => x.to_string(),
203            _ => String::new(),
204        };
205
206        let description = match element
207            .select(&Selector::parse("#synopsistext").to_result()?)
208            .next()
209            .and_then(|e| e.text().next())
210        {
211            Some(x) => x.to_string(),
212            _ => String::new(),
213        };
214
215        let mut publish_date = String::new();
216        let mut volume = String::new();
217        let mut issn = String::new();
218        let mut publisher = String::new();
219        let mut author = String::new();
220        let mut length = 0;
221        let mut date_digitized = String::new();
222        let mut orig_from = String::new();
223        let mut isbn = Vec::<String>::new();
224
225        // Main metadata area
226        if let Some(e) = element
227            .select(&Selector::parse("#metadata").to_result()?)
228            .next()
229        {
230            for (i, child) in e.text().enumerate() {
231                if i == 0 {
232                    publish_date = child.to_string();
233                } else if child.starts_with(Self::PREFIX_PUBLISHER) {
234                    publisher = Self::remove_and_extract(child, Self::PREFIX_PUBLISHER);
235                } else if child.starts_with(Self::PREFIX_ISSN) {
236                    issn = Self::remove_and_extract(child, Self::PREFIX_ISSN);
237                } else if child.ends_with(Self::SUFFIX_PAGES) {
238                    length = Self::parse_length(child)?;
239                } else {
240                    volume = child.to_string();
241                }
242            }
243        };
244
245        // Bibliography area - used specifically by books?
246        for tr in doc.select(&Selector::parse(".metadata_row").to_result()?) {
247            if let Some(label) = tr
248                .select(&Selector::parse(".metadata_label").to_result()?)
249                .next()
250                .and_then(|e| e.text().next())
251            {
252                if let Some(value) = tr
253                    .select(&Selector::parse(".metadata_value span").to_result()?)
254                    .next()
255                    .and_then(|e| e.text().next())
256                {
257                    match label {
258                        Self::LABEL_TITLE => {
259                            title = value.to_string();
260                        }
261                        Self::LABEL_AUTHOR => {
262                            author = value.to_string();
263                        }
264                        Self::LABEL_PUBLISHER => {
265                            publisher = value.to_string();
266                        }
267                        Self::LABEL_ORIG_FROM => {
268                            orig_from = value.to_string();
269                        }
270                        Self::LABEL_DIGITIZED => {
271                            date_digitized = value.to_string();
272                        }
273                        Self::LABEL_ISBN => {
274                            value
275                                .split(",")
276                                .for_each(|x| isbn.push(x.trim().to_string()));
277                        }
278                        Self::LABEL_LENGTH => {
279                            length = Self::parse_length(value)?;
280                        }
281                        _ => (),
282                    }
283                }
284            }
285        }
286
287        // Determine content type from text in preview link
288        let book_type = match doc
289            .select(&Selector::parse("#preview-link span").to_result()?)
290            .next()
291            .and_then(|e| e.text().next())
292        {
293            Some(x) => {
294                if x.contains("magazine") {
295                    ContentType::Magazine
296                } else if x.contains("newspaper") {
297                    ContentType::Newspaper
298                } else {
299                    ContentType::Book
300                }
301            }
302            _ => ContentType::Book,
303        };
304
305        Ok(BookMetadata {
306            id: id.to_string(),
307            title,
308            publish_date,
309            volume,
310            issn,
311            publisher,
312            description,
313            book_type,
314            author,
315            length,
316            date_digitized,
317            orig_from,
318        })
319    }
320}