gbscraper/scraper/
types.rs

1use bitflags::bitflags;
2use scraper::selectable::Selectable;
3use scraper::{Html, Selector};
4use std::collections::HashSet;
5use std::io::{self};
6
7use super::helpers::*;
8
9pub use json_api::IssueJson;
10pub use json_api::PageJson;
11
12/// Scrape options.
13pub struct ScraperOptions {
14    /// If true, downloaded images will not be deleted after conversion.
15    pub keep_images: bool,
16    /// Format(s) to convert downloaded images to.
17    pub formats: FormatFlags,
18    /// IDs of issues to skip.
19    pub already_downloaded: HashSet<String>,
20    /// File to store IDs of already downloaded books.
21    pub archive_file: Option<String>,
22    /// If true, only retrieve metadata without downloading or processing images.
23    pub skip_download: bool,
24    /// Number of times to attempt to download any file before giving up on a book. Set to 0 to try indefinitely.
25    pub download_attempts: u32,
26    /// If true, extra output will be given.
27    pub verbose: bool,
28}
29
30impl Default for ScraperOptions {
31    fn default() -> Self {
32        Self {
33            keep_images: false,
34            formats: FormatFlags::Pdf,
35            already_downloaded: HashSet::new(),
36            archive_file: None,
37            skip_download: false,
38            download_attempts: 3,
39            verbose: false,
40        }
41    }
42}
43
44bitflags! {
45    /// Format(s) downloaded images to
46    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
47    pub struct FormatFlags:u32 {
48        const None = 0b000;
49        const Pdf =  0b001;
50        const Cbz =  0b010;
51        const All =  0b011;
52    }
53}
54
/// Metadata for a book or an individual issue of a magazine or newspaper.
///
/// Text fields are left as empty strings (and `length` as 0) when
/// `BookMetadata::from_page` does not find the corresponding information
/// on the page.
#[derive(Debug, PartialEq, Eq)]
pub struct BookMetadata {
    /// ID used to identify the book resource (the `id` passed to `from_page`).
    pub id: String,
    /// Title of the book or periodical. Taken from the `.booktitle` element and
    /// replaced by the bibliography "Title" row when one is present.
    pub title: String,
    /// Date the issue was published (the first text node of the `#metadata` element).
    pub publish_date: String,
    /// Volume of the issue. Filled from `#metadata` text that matches none of the
    /// recognised publisher / ISSN / page-count patterns; the last such text wins.
    pub volume: String,
    /// ISSN of the publication (the text following the "ISSN " prefix).
    pub issn: String,
    /// Publisher, from the "Published by " text or the bibliography "Publisher" row.
    pub publisher: String,
    /// Description of the publication (first text node of `#synopsistext`).
    pub description: String,
    /// Type of content: book, magazine or newspaper.
    pub book_type: ContentType,
    /// Author of the book (bibliography "Author" row).
    pub author: String,
    /// Number of pages, parsed from an "N pages" text or the bibliography "Length" row.
    pub length: u32,
    /// Date the book was digitized (bibliography "Digitized" row).
    pub date_digitized: String,
    /// Source of the book (bibliography "Original from" row).
    pub orig_from: String,
}
83
/// Data types for deserializing the JSON returned by the API calls that
/// provide book info.
///
/// Every `rename` below is `deserialize`-only, so the JSON key is used when
/// reading while the Rust field name is used when writing.
mod json_api {
    use serde::{Deserialize, Serialize};

    /// Result of API call to get metadata about a book or issue.
    #[derive(Serialize, Deserialize)]
    pub struct IssueJson {
        /// Per-page entries, read from the JSON key `page`.
        pub page: Vec<PageJson>,
    }

    /// Metadata pertaining to specific page.
    #[derive(Serialize, Deserialize)]
    pub struct PageJson {
        /// Page identifier (presumably unique within the issue — confirm against the API).
        pub pid: String,
        /// Optional source string for the page; presumably the image URL — TODO confirm.
        pub src: Option<String>,
        /// Optional extra information attached to the page.
        pub additional_info: Option<PageAdditionalInfo>,
    }

    /// Additional metadata for specific page.
    #[derive(Serialize, Deserialize)]
    pub struct PageAdditionalInfo {
        /// Newspaper-specific details, read from the JSON key `[NewspaperJSONPageInfo]`.
        #[serde(rename(deserialize = "[NewspaperJSONPageInfo]"))]
        pub newspaper_json_page_info: Option<NewspaperJsonPageInfo>,
    }

    /// Additional metadata for newspaper pages.
    #[derive(Serialize, Deserialize)]
    pub struct NewspaperJsonPageInfo {
        /// Available tile resolutions, read from the JSON key `tileres`.
        #[serde(rename(deserialize = "tileres"))]
        pub tile_res: Vec<TileRes>,
        /// Coordinates of the page within the scan job
        /// (exact semantics not visible here — NOTE(review): confirm).
        pub page_scanjob_coordinates: Coordinates,
    }

    /// One tile resolution entry: a height/width pair for a given zoom value.
    #[derive(Serialize, Deserialize)]
    pub struct TileRes {
        /// Height, read from the JSON key `h` (presumably pixels — confirm).
        #[serde(rename(deserialize = "h"))]
        pub height: u32,
        /// Width, read from the JSON key `w` (presumably pixels — confirm).
        #[serde(rename(deserialize = "w"))]
        pub width: u32,
        /// Zoom value, read from the JSON key `z`.
        #[serde(rename(deserialize = "z"))]
        pub zoom: u32,
    }

    /// A pair of unsigned x/y coordinates.
    #[derive(Serialize, Deserialize)]
    pub struct Coordinates {
        /// Horizontal coordinate.
        pub x: u32,
        /// Vertical coordinate.
        pub y: u32,
    }
}
133
/// Kind of publication a scraped resource represents.
///
/// The type is a plain tag with no payload, so it is `Copy` and can be used
/// as a hash key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ContentType {
    /// A standalone book; also the fallback when no other kind is detected.
    Book,
    /// An issue of a magazine.
    Magazine,
    /// An issue of a newspaper.
    Newspaper,
}
140
/// Outcome of processing a single book or issue.
#[derive(Debug, PartialEq, Eq)]
pub enum DownloadStatus {
    /// The item was not processed.
    /// NOTE(review): presumably used when its ID is already in
    /// `ScraperOptions::already_downloaded` — confirm against the scraper.
    Skipped,
    /// The item was processed; carries the metadata extracted for it.
    Complete(BookMetadata),
}
146
147impl BookMetadata {
148    const SUFFIX_PAGES: &'static str = " pages";
149    const PREFIX_PUBLISHER: &'static str = "Published by ";
150    const PREFIX_ISSN: &'static str = "ISSN ";
151
152    const LABEL_TITLE: &'static str = "Title";
153    const LABEL_AUTHOR: &'static str = "Author";
154    const LABEL_PUBLISHER: &'static str = "Publisher";
155    const LABEL_ORIG_FROM: &'static str = "Original from";
156    const LABEL_DIGITIZED: &'static str = "Digitized";
157    const LABEL_LENGTH: &'static str = "Length";
158    const LABEL_ISBN: &'static str = "ISBN";
159
160    /// Gets the shortest title identifying this book.
161    pub fn get_title(&self) -> &str {
162        match self.book_type {
163            ContentType::Magazine | ContentType::Newspaper => &self.publish_date,
164            ContentType::Book => &self.title,
165        }
166    }
167
168    /// Gets the full title of this book, including the series name if it is a magazine issue.
169    pub fn get_full_title(&self) -> String {
170        match self.book_type {
171            ContentType::Magazine | ContentType::Newspaper => {
172                std::format!("{} - {}", &self.title, &self.publish_date)
173            }
174            ContentType::Book => self.title.to_string(),
175        }
176    }
177
178    fn parse_length(text: &str) -> io::Result<u32> {
179        Ok(Self::remove_and_extract(text, Self::SUFFIX_PAGES)
180            .parse::<u32>()
181            .to_result()?)
182    }
183
184    fn remove_and_extract(source: &str, to_remove: &str) -> String {
185        source.replace(to_remove, "").trim().to_string()
186    }
187
188    /// Extracts metadata from webpage.
189    pub fn from_page(id: &str, doc: &Html) -> io::Result<BookMetadata> {
190        let element = doc
191            .select(&Selector::parse("#summary_content_table").to_result()?)
192            .next()
193            .to_result("Metadata could not be parsed.")?;
194
195        let mut title = match element
196            .select(&Selector::parse(".booktitle").to_result()?)
197            .next()
198            .and_then(|e| e.text().next())
199        {
200            Some(x) => x.to_string(),
201            _ => String::new(),
202        };
203
204        let description = match element
205            .select(&Selector::parse("#synopsistext").to_result()?)
206            .next()
207            .and_then(|e| e.text().next())
208        {
209            Some(x) => x.to_string(),
210            _ => String::new(),
211        };
212
213        let mut publish_date = String::new();
214        let mut volume = String::new();
215        let mut issn = String::new();
216        let mut publisher = String::new();
217        let mut author = String::new();
218        let mut length = 0;
219        let mut date_digitized = String::new();
220        let mut orig_from = String::new();
221        let mut isbn = Vec::<String>::new();
222
223        // Main metadata area
224        if let Some(e) = element
225            .select(&Selector::parse("#metadata").to_result()?)
226            .next()
227        {
228            let mut i: u32 = 0;
229            for child in e.text() {
230                if i == 0 {
231                    publish_date = child.to_string();
232                } else if child.starts_with(Self::PREFIX_PUBLISHER) {
233                    publisher = Self::remove_and_extract(child, Self::PREFIX_PUBLISHER);
234                } else if child.starts_with(Self::PREFIX_ISSN) {
235                    issn = Self::remove_and_extract(child, Self::PREFIX_ISSN);
236                } else if child.ends_with(Self::SUFFIX_PAGES) {
237                    length = Self::parse_length(child)?;
238                } else {
239                    volume = child.to_string();
240                }
241
242                i += 1;
243            }
244        };
245
246        // Bibliography area - used specifically by books?
247        for tr in doc.select(&Selector::parse(".metadata_row").to_result()?) {
248            if let Some(label) = tr
249                .select(&Selector::parse(".metadata_label").to_result()?)
250                .next()
251                .and_then(|e| e.text().next())
252            {
253                if let Some(value) = tr
254                    .select(&Selector::parse(".metadata_value span").to_result()?)
255                    .next()
256                    .and_then(|e| e.text().next())
257                {
258                    match label {
259                        Self::LABEL_TITLE => {
260                            title = value.to_string();
261                        }
262                        Self::LABEL_AUTHOR => {
263                            author = value.to_string();
264                        }
265                        Self::LABEL_PUBLISHER => {
266                            publisher = value.to_string();
267                        }
268                        Self::LABEL_ORIG_FROM => {
269                            orig_from = value.to_string();
270                        }
271                        Self::LABEL_DIGITIZED => {
272                            date_digitized = value.to_string();
273                        }
274                        Self::LABEL_ISBN => {
275                            value
276                                .split(",")
277                                .for_each(|x| isbn.push(x.trim().to_string()));
278                        }
279                        Self::LABEL_LENGTH => {
280                            length = Self::parse_length(value)?;
281                        }
282                        _ => (),
283                    }
284                }
285            }
286        }
287
288        // Determine content type from text in preview link
289        let book_type = match doc
290            .select(&Selector::parse("#preview-link span").to_result()?)
291            .next()
292            .and_then(|e| e.text().next())
293        {
294            Some(x) => {
295                if x.contains("magazine") {
296                    ContentType::Magazine
297                } else if x.contains("newspaper") {
298                    ContentType::Newspaper
299                } else {
300                    ContentType::Book
301                }
302            }
303            _ => ContentType::Book,
304        };
305
306        Ok(BookMetadata {
307            id: id.to_string(),
308            title,
309            publish_date,
310            volume,
311            issn,
312            publisher,
313            description,
314            book_type,
315            author,
316            length,
317            date_digitized,
318            orig_from,
319        })
320    }
321}
gbscraper/scraper/types.rs

gbscraper/scraper/
types.rs