use bitflags::bitflags;
use scraper::selectable::Selectable;
use scraper::{Html, Selector};
use std::collections::HashSet;
use std::io::{self};
use super::helpers::*;
pub use json_api::IssueJson;
pub use json_api::PageJson;
/// Options that configure a scraper run.
///
/// The `Default` implementation in this file yields: no image keeping, PDF
/// as the only format, an empty set of already-downloaded entries, no
/// archive file and downloads enabled.
///
/// NOTE(review): how each option is consumed is not visible in this part of
/// the file; the per-field notes marked "presumably" are inferred from the
/// field names and should be confirmed against the callers.
pub struct ScraperOptions {
/// Presumably whether intermediate page images are kept — TODO confirm.
/// Defaults to `false`.
pub keep_images: bool,
/// Selected formats as a [`FormatFlags`] bit set. Defaults to
/// `FormatFlags::Pdf`.
pub formats: FormatFlags,
/// Set of strings identifying items that were already downloaded
/// (presumably item ids — verify against caller). Defaults to empty.
pub already_downloaded: HashSet<String>,
/// Optional archive file (presumably a path used to persist
/// `already_downloaded` — TODO confirm). Defaults to `None`.
pub archive_file: Option<String>,
/// Presumably disables the download step when `true` — TODO confirm.
/// Defaults to `false`.
pub skip_download: bool,
}
impl Default for ScraperOptions {
fn default() -> Self {
Self {
keep_images: false,
formats: FormatFlags::Pdf,
already_downloaded: HashSet::new(),
archive_file: None,
skip_download: false,
}
}
}
bitflags! {
    /// Bit set of formats; this is the type of `ScraperOptions::formats`.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct FormatFlags: u32 {
        /// Empty set (no bits). Because it has no bits set,
        /// `contains(FormatFlags::None)` is true for every value.
        const None = 0;
        /// PDF format (bit 0).
        const Pdf = 1 << 0;
        /// CBZ format (bit 1).
        const Cbz = 1 << 1;
        /// Every known format: `Pdf | Cbz`.
        const All = Self::Pdf.bits() | Self::Cbz.bits();
    }
}
/// Metadata describing one scraped item (book, magazine issue or newspaper
/// issue), as extracted from an HTML page by `BookMetadata::from_page`.
///
/// String fields that are not found on the page are left empty and
/// `length` is left at `0`.
#[derive(Debug, PartialEq, Eq)]
pub struct BookMetadata {
/// Identifier passed to `from_page` by the caller, stored verbatim.
pub id: String,
/// Title: first text of `.booktitle`, overridden by a "Title" metadata row
/// when one is present.
pub title: String,
/// First text node of the `#metadata` element.
pub publish_date: String,
/// Last text node of `#metadata` (after the first) that is not recognised
/// as publisher, ISSN or page count.
pub volume: String,
/// Text following the "ISSN " prefix in `#metadata`.
pub issn: String,
/// Publisher, from the "Published by " text in `#metadata` or from a
/// "Publisher" metadata row (the row wins when both exist).
pub publisher: String,
/// First text node of `#synopsistext`.
pub description: String,
/// Kind of publication, derived from the text of `#preview-link span`.
pub book_type: ContentType,
/// Value of the "Author" metadata row.
pub author: String,
/// Number of pages, parsed from an "N pages" text or a "Length" row.
pub length: u32,
/// Value of the "Digitized" metadata row (kept as the raw string).
pub date_digitized: String,
/// Value of the "Original from" metadata row.
pub orig_from: String,
}
/// Serde data types for a JSON document that lists pages.
///
/// Several fields use `rename(deserialize = ...)`, so the alternative key is
/// only applied when reading JSON; serialization uses the Rust field name.
///
/// NOTE(review): the service that produces this JSON is not visible here;
/// field meanings below that go beyond the visible names are assumptions.
mod json_api {
use serde::{Deserialize, Serialize};
/// Top-level document: a list of pages under the `page` key.
#[derive(Serialize, Deserialize)]
pub struct IssueJson {
/// All page entries of the document.
pub page: Vec<PageJson>,
}
/// One page entry.
#[derive(Serialize, Deserialize)]
pub struct PageJson {
/// Page identifier string.
pub pid: String,
/// Optional source string (presumably the page image URL — TODO confirm).
pub src: Option<String>,
/// Optional extra information attached to the page.
pub additional_info: Option<PageAdditionalInfo>,
}
/// Wrapper around the optional newspaper-specific page information.
#[derive(Serialize, Deserialize)]
pub struct PageAdditionalInfo {
// Read from the JSON key "[NewspaperJSONPageInfo]" (brackets included).
#[serde(rename(deserialize = "[NewspaperJSONPageInfo]"))]
pub newspaper_json_page_info: Option<NewspaperJsonPageInfo>,
}
/// Newspaper page information: tile resolutions plus a coordinate pair.
#[derive(Serialize, Deserialize)]
pub struct NewspaperJsonPageInfo {
// Read from the JSON key "tileres".
#[serde(rename(deserialize = "tileres"))]
pub tile_res: Vec<TileRes>,
/// Coordinates read from the `page_scanjob_coordinates` key.
pub page_scanjob_coordinates: Coordinates,
}
/// One resolution entry; read from the short JSON keys `h`, `w` and `z`.
#[derive(Serialize, Deserialize)]
pub struct TileRes {
#[serde(rename(deserialize = "h"))]
pub height: u32,
#[serde(rename(deserialize = "w"))]
pub width: u32,
#[serde(rename(deserialize = "z"))]
pub zoom: u32,
}
/// Unsigned x/y coordinate pair.
#[derive(Serialize, Deserialize)]
pub struct Coordinates {
pub x: u32,
pub y: u32,
}
}
/// Kind of publication a `BookMetadata` describes.
///
/// `BookMetadata::from_page` picks `Magazine` or `Newspaper` when the text of
/// `#preview-link span` contains that word, and `Book` otherwise.
#[derive(Debug, PartialEq, Eq)]
pub enum ContentType {
/// A book; also the fallback when no other kind is detected.
Book,
/// A magazine issue.
Magazine,
/// A newspaper issue.
Newspaper,
}
/// Outcome of processing one item.
///
/// NOTE(review): the code producing these values is not visible in this part
/// of the file; the variant descriptions are inferred from their names.
#[derive(Debug, PartialEq, Eq)]
pub enum DownloadStatus {
/// Presumably: the item was not downloaded — TODO confirm the conditions.
Skipped,
/// The item was processed; carries its metadata.
Complete(BookMetadata),
}
impl BookMetadata {
    // Fixed fragments of the free-text entries found inside `#metadata`.
    const SUFFIX_PAGES: &'static str = " pages";
    const PREFIX_PUBLISHER: &'static str = "Published by ";
    const PREFIX_ISSN: &'static str = "ISSN ";

    // Labels of the `.metadata_row` table rows that are understood.
    const LABEL_TITLE: &'static str = "Title";
    const LABEL_AUTHOR: &'static str = "Author";
    const LABEL_PUBLISHER: &'static str = "Publisher";
    const LABEL_ORIG_FROM: &'static str = "Original from";
    const LABEL_DIGITIZED: &'static str = "Digitized";
    const LABEL_LENGTH: &'static str = "Length";
    const LABEL_ISBN: &'static str = "ISBN";

    /// Returns the short title of this item.
    ///
    /// For magazines and newspapers this is the publish date (the issue is
    /// identified by its date); for books it is the title itself.
    pub fn get_title(&self) -> &str {
        match self.book_type {
            ContentType::Magazine | ContentType::Newspaper => &self.publish_date,
            ContentType::Book => &self.title,
        }
    }

    /// Returns the full display title.
    ///
    /// Magazines and newspapers yield `"<title> - <publish_date>"`; books
    /// yield a copy of the title.
    pub fn get_full_title(&self) -> String {
        match self.book_type {
            ContentType::Magazine | ContentType::Newspaper => {
                std::format!("{} - {}", &self.title, &self.publish_date)
            }
            ContentType::Book => self.title.clone(),
        }
    }

    /// Parses a page count out of a text such as `"320 pages"`.
    ///
    /// # Errors
    ///
    /// Fails when what remains after removing `" pages"` and trimming is not
    /// a valid `u32`.
    fn parse_length(text: &str) -> io::Result<u32> {
        Ok(Self::remove_and_extract(text, Self::SUFFIX_PAGES)
            .parse::<u32>()
            .to_result()?)
    }

    /// Removes every occurrence of `to_remove` from `source` and returns the
    /// remainder with surrounding whitespace trimmed.
    fn remove_and_extract(source: &str, to_remove: &str) -> String {
        source.replace(to_remove, "").trim().to_string()
    }

    /// Extracts the metadata of the item `id` from its parsed HTML page.
    ///
    /// Three parts of the document are read:
    /// * `#summary_content_table`: title (`.booktitle`), description
    ///   (`#synopsistext`) and the free-text `#metadata` entries (publish
    ///   date, publisher, ISSN, page count, volume);
    /// * every `.metadata_row`: label/value pairs which override the values
    ///   found above when both are present;
    /// * `#preview-link span`: decides the [`ContentType`].
    ///
    /// Only the first text node of each matched element is used. Values that
    /// are not present stay empty (or `0` for the length).
    ///
    /// # Errors
    ///
    /// Returns an error when `#summary_content_table` is missing, when a
    /// page count cannot be parsed as a number, or when a selector fails to
    /// parse.
    pub fn from_page(id: &str, doc: &Html) -> io::Result<BookMetadata> {
        let element = doc
            .select(&Selector::parse("#summary_content_table").to_result()?)
            .next()
            .to_result("Metadata could not be parsed.")?;

        let mut title = element
            .select(&Selector::parse(".booktitle").to_result()?)
            .next()
            .and_then(|e| e.text().next())
            .map(str::to_string)
            .unwrap_or_default();

        let description = element
            .select(&Selector::parse("#synopsistext").to_result()?)
            .next()
            .and_then(|e| e.text().next())
            .map(str::to_string)
            .unwrap_or_default();

        let mut publish_date = String::new();
        let mut volume = String::new();
        let mut issn = String::new();
        let mut publisher = String::new();
        let mut author = String::new();
        let mut length = 0;
        let mut date_digitized = String::new();
        let mut orig_from = String::new();

        if let Some(e) = element
            .select(&Selector::parse("#metadata").to_result()?)
            .next()
        {
            for (index, text) in e.text().enumerate() {
                if index == 0 {
                    // The first text node is always taken as the publish date.
                    publish_date = text.to_string();
                } else if text.starts_with(Self::PREFIX_PUBLISHER) {
                    publisher = Self::remove_and_extract(text, Self::PREFIX_PUBLISHER);
                } else if text.starts_with(Self::PREFIX_ISSN) {
                    issn = Self::remove_and_extract(text, Self::PREFIX_ISSN);
                } else if text.ends_with(Self::SUFFIX_PAGES) {
                    length = Self::parse_length(text)?;
                } else {
                    // Anything unrecognised is treated as the volume; a later
                    // unrecognised node replaces an earlier one.
                    volume = text.to_string();
                }
            }
        }

        // Parse the row selectors once instead of once per table row.
        let row_selector = Selector::parse(".metadata_row").to_result()?;
        let label_selector = Selector::parse(".metadata_label").to_result()?;
        let value_selector = Selector::parse(".metadata_value span").to_result()?;

        for row in doc.select(&row_selector) {
            let label = row
                .select(&label_selector)
                .next()
                .and_then(|e| e.text().next());
            let value = row
                .select(&value_selector)
                .next()
                .and_then(|e| e.text().next());

            // Rows lacking either a label or a value carry no information.
            if let (Some(label), Some(value)) = (label, value) {
                match label {
                    Self::LABEL_TITLE => title = value.to_string(),
                    Self::LABEL_AUTHOR => author = value.to_string(),
                    Self::LABEL_PUBLISHER => publisher = value.to_string(),
                    Self::LABEL_ORIG_FROM => orig_from = value.to_string(),
                    Self::LABEL_DIGITIZED => date_digitized = value.to_string(),
                    Self::LABEL_LENGTH => length = Self::parse_length(value)?,
                    Self::LABEL_ISBN => {
                        // NOTE(review): ISBN rows are recognised, but
                        // `BookMetadata` has no field to store them, so the
                        // value is intentionally dropped here.
                    }
                    _ => (),
                }
            }
        }

        let book_type = match doc
            .select(&Selector::parse("#preview-link span").to_result()?)
            .next()
            .and_then(|e| e.text().next())
        {
            Some(kind) if kind.contains("magazine") => ContentType::Magazine,
            Some(kind) if kind.contains("newspaper") => ContentType::Newspaper,
            // Missing element or any other text: treat the item as a book.
            _ => ContentType::Book,
        };

        Ok(BookMetadata {
            id: id.to_string(),
            title,
            publish_date,
            volume,
            issn,
            publisher,
            description,
            book_type,
            author,
            length,
            date_digitized,
            orig_from,
        })
    }
}