use bitflags::bitflags;
use scraper::selectable::Selectable;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet, VecDeque};
use std::fmt::Display;
use std::fs::OpenOptions;
use std::io::{self};
use std::io::{Read, Write};
use url::Url;
use crate::cbz::create_cbz;
use crate::pdf::{create_pdf_with_toc, TableOfContents};
/// Settings consulted (and partly updated) by `download_issue`.
pub struct ScraperOptions {
/// When `false`, `download_issue` removes the page-image directory after the
/// outputs are generated, unless that directory already existed before the call.
pub keep_images: bool,
/// Output formats `download_issue` generates from the downloaded images.
pub formats: FormatFlags,
/// Book IDs that `download_issue` skips; every ID it completes is inserted here.
pub already_downloaded: HashSet<String>,
/// Optional path of a file to which `download_issue` appends each completed
/// ID, one per line.
pub archive_file: Option<String>,
/// When `true`, `download_issue` returns the parsed metadata without
/// downloading page images or generating any output file.
pub skip_download: bool,
}
impl Default for ScraperOptions {
fn default() -> Self {
Self {
keep_images: false,
formats: FormatFlags::Pdf,
already_downloaded: HashSet::new(),
archive_file: None,
skip_download: false,
}
}
}
bitflags! {
/// Set of output formats that can be generated for a downloaded book.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct FormatFlags:u32 {
/// No output format selected (all bits clear).
const None = 0b000;
/// PDF output.
const Pdf = 0b001;
/// CBZ output.
const Cbz = 0b010;
/// Both `Pdf` and `Cbz`.
const All = 0b011;
}
}
/// Metadata scraped from a book's overview page by `BookMetadata::from_page`.
///
/// String fields are left empty (and `length` is `0`) when the corresponding
/// element is not found on the page.
#[derive(Debug, PartialEq, Eq)]
pub struct BookMetadata {
/// Identifier passed to `from_page`.
pub id: String,
/// First text node of the `.booktitle` element.
pub series_name: String,
/// First text node of the `#metadata` element.
pub publish_date: String,
/// Third text node of the `#metadata` element.
pub volume: String,
/// Fourth text node of the `#metadata` element.
pub issn: String,
/// Fifth text node of the `#metadata` element, replaced by the value of the
/// "Publisher" metadata row when such a row exists.
pub publisher: String,
/// First text node of the `#synopsistext` element.
pub description: String,
/// Whether the page was classified as a magazine issue or a book.
pub book_type: BookType,
/// Value of the "Author" metadata row.
pub author: String,
/// Page count parsed from the second `#metadata` text node or from the
/// "Length" metadata row (the row wins when both are present).
pub length: u32,
/// Value of the "Digitized" metadata row.
pub date_digitized: String,
/// Value of the "Original from" metadata row.
pub orig_from: String,
}
/// One entry of the `page` array in the JSON page listing.
#[derive(Serialize, Deserialize)]
struct PageJson {
// Page identifier; used as the `pg` query parameter and in image file names.
pid: String,
// Image URL for the page, when the response carries one. `download_issue`
// queues pages without a `src` and downloads pages that have one.
src: Option<String>,
}
/// JSON page listing as deserialized by `download_issue`.
#[derive(Serialize, Deserialize)]
struct IssueJson {
// Pages in the order the response lists them.
page: Vec<PageJson>,
}
/// Kind of publication, as classified by `BookMetadata::from_page`.
#[derive(Debug, PartialEq, Eq)]
pub enum BookType {
/// Default classification, used when the preview link does not mention "magazine".
Book,
/// Chosen when the text of `#preview-link span` contains "magazine".
Issue,
}
/// Converts a fallible value into an [`std::io::Result`].
trait ToResult<T> {
    /// Returns the success value unchanged, or an `io::Error` describing the failure.
    fn to_result(self) -> std::io::Result<T>;
}

/// Any `Result` whose error can be displayed maps onto an `io::Error` of kind
/// `Other` carrying the error's `Display` text.
impl<T, E: Display> ToResult<T> for std::result::Result<T, E> {
    fn to_result(self) -> std::io::Result<T> {
        self.map_err(|err| std::io::Error::new(io::ErrorKind::Other, err.to_string()))
    }
}
/// Converts an optional value into an [`std::io::Result`], using `msg` as the
/// error text when the value is absent.
trait ToResultErrorMessage<T> {
    /// Returns the contained value, or an `io::Error` carrying `msg`.
    fn to_result(self, msg: &str) -> std::io::Result<T>;
}

/// `None` becomes an `io::Error` of kind `Other` whose message is `msg`.
impl<T> ToResultErrorMessage<T> for Option<T> {
    fn to_result(self, msg: &str) -> std::io::Result<T> {
        self.ok_or_else(|| std::io::Error::new(io::ErrorKind::Other, msg))
    }
}
impl BookMetadata {
    /// Short title: the publish date for magazine issues, the series name for books.
    pub fn get_title(&self) -> &str {
        match self.book_type {
            BookType::Issue => &self.publish_date,
            BookType::Book => &self.series_name,
        }
    }

    /// Full title: `"<series name> - <publish date>"` for magazine issues, the
    /// series name alone for books.
    pub fn get_full_title(&self) -> String {
        match self.book_type {
            BookType::Issue => format!("{} - {}", self.series_name, self.publish_date),
            BookType::Book => self.series_name.clone(),
        }
    }

    /// Parses a page count such as `"545 pages"` into a number.
    ///
    /// # Errors
    /// Returns an error when the remaining text is not a valid `u32`.
    fn parse_length(text: &str) -> io::Result<u32> {
        text.replace(" pages", "").trim().parse::<u32>().to_result()
    }

    /// Extracts the metadata of book `id` from its parsed overview page.
    ///
    /// # Errors
    /// Returns an error when the `#summary_content_table` element is missing or
    /// when a page count on the page cannot be parsed as a number.
    pub fn from_page(id: &str, doc: &Html) -> io::Result<BookMetadata> {
        let summary = doc
            .select(&Selector::parse("#summary_content_table").to_result()?)
            .next()
            .to_result("Metadata could not be parsed.")?;

        let series_name = summary
            .select(&Selector::parse(".booktitle").to_result()?)
            .next()
            .and_then(|e| e.text().next())
            .map(str::to_string)
            .unwrap_or_default();
        let description = summary
            .select(&Selector::parse("#synopsistext").to_result()?)
            .next()
            .and_then(|e| e.text().next())
            .map(str::to_string)
            .unwrap_or_default();

        let mut publish_date = String::new();
        let mut volume = String::new();
        let mut issn = String::new();
        let mut publisher = String::new();
        let mut author = String::new();
        let mut length = 0;
        let mut date_digitized = String::new();
        let mut orig_from = String::new();

        // The `#metadata` element carries its values as consecutive text nodes;
        // each node's position determines which field it fills.
        if let Some(block) = summary
            .select(&Selector::parse("#metadata").to_result()?)
            .next()
        {
            for (position, text) in block.text().enumerate() {
                match position {
                    0 => publish_date = text.to_string(),
                    1 => length = Self::parse_length(text)?,
                    2 => volume = text.to_string(),
                    3 => issn = text.to_string(),
                    4 => publisher = text.to_string(),
                    _ => (),
                }
            }
        }

        // Parse the row selectors once instead of once per table row.
        let row_selector = Selector::parse(".metadata_row").to_result()?;
        let label_selector = Selector::parse(".metadata_label").to_result()?;
        let value_selector = Selector::parse(".metadata_value span").to_result()?;
        for row in doc.select(&row_selector) {
            let label = row
                .select(&label_selector)
                .next()
                .and_then(|e| e.text().next());
            let value = row
                .select(&value_selector)
                .next()
                .and_then(|e| e.text().next());
            // Rows missing either a label or a value carry nothing to record.
            if let (Some(label), Some(value)) = (label, value) {
                match label {
                    "Author" => author = value.to_string(),
                    "Publisher" => publisher = value.to_string(),
                    "Original from" => orig_from = value.to_string(),
                    "Digitized" => date_digitized = value.to_string(),
                    "Length" => length = Self::parse_length(value)?,
                    _ => (),
                }
            }
        }

        // Anything whose preview link does not mention "magazine" is a book.
        let is_magazine = doc
            .select(&Selector::parse("#preview-link span").to_result()?)
            .next()
            .and_then(|e| e.text().next())
            .is_some_and(|text| text.contains("magazine"));
        let book_type = if is_magazine {
            BookType::Issue
        } else {
            BookType::Book
        };

        Ok(BookMetadata {
            id: id.to_string(),
            series_name,
            publish_date,
            volume,
            issn,
            publisher,
            description,
            book_type,
            author,
            length,
            date_digitized,
            orig_from,
        })
    }
}
/// Extracts the book ID from a Google Books URL.
///
/// The `id` query parameter is used when present; otherwise the last path
/// segment is taken as the ID.
///
/// # Errors
/// Returns an error when `url` does not parse or has no path segments.
fn id_from_url(url: &str) -> io::Result<String> {
    const INVALID_URL: &str = "Invalid URL";
    let parsed = Url::try_from(url).to_result()?;

    if let Some(pair) = parsed.query_pairs().find(|pair| pair.0 == "id") {
        return Ok(pair.1.to_string());
    }

    let last_segment = parsed
        .path_segments()
        .to_result(INVALID_URL)?
        .last()
        .to_result(INVALID_URL)?;
    Ok(last_segment.to_string())
}
/// Builds the canonical `books.google.com` URL for the book with the given ID.
fn url_from_id(id: &str) -> String {
    const BASE: &str = "https://books.google.com/books?id=";
    [BASE, id].concat()
}
/// Builds the URL of the JSON page listing for book `id`: the canonical book
/// URL followed by the `lpg`, `pg` and `jscmd=click3` query parameters.
fn get_json_url(id: &str, first_page: &str, page_id: &str) -> String {
    let mut url = url_from_id(id);
    url.push_str("&lpg=");
    url.push_str(first_page);
    url.push_str("&pg=");
    url.push_str(page_id);
    url.push_str("&jscmd=click3");
    url
}
/// Outcome of a successful `download_issue` call.
#[derive(Debug, PartialEq, Eq)]
pub enum DownloadStatus {
/// The book was not processed because it was found to be downloaded already.
Skipped,
/// The book was processed; carries the metadata scraped from its page. Also
/// returned when `ScraperOptions::skip_download` stopped the call after the
/// metadata was parsed.
Complete(BookMetadata),
}
pub fn download_issue(
url: &str,
dest: &str,
options: &mut ScraperOptions,
) -> io::Result<DownloadStatus> {
let id = id_from_url(url)?;
let url = url_from_id(&id);
if options.already_downloaded.contains(&id) {
println!("Skipping already downloaded book: {id}...");
return Ok(DownloadStatus::Skipped);
}
println!("Identifying book: {id}...");
let res = reqwest::blocking::get(url).to_result()?;
let body = res.text().to_result()?;
let doc = Html::parse_document(&body);
let meta = BookMetadata::from_page(&id, &doc)?;
let issue_combined_id = std::format!("{0} [{1}]", meta.get_full_title(), meta.id);
let dest = match meta.book_type {
BookType::Issue => std::format!("{dest}/{0}", meta.series_name),
BookType::Book => dest.to_string(),
};
let issue_pics_dir = std::format!("{dest}/{issue_combined_id}");
let filename_pdf = std::format!("{dest}/{issue_combined_id}.pdf");
let filename_cbz = std::format!("{dest}/{issue_combined_id}.cbz");
println!("Found: {}", meta.get_full_title());
let mut formats = options.formats.clone();
let exists_already = std::path::Path::new(&issue_pics_dir).exists();
if exists_already {
if std::path::Path::new(&filename_pdf).exists() {
formats.remove(FormatFlags::Pdf)
}
if std::path::Path::new(&filename_cbz).exists() {
formats.remove(FormatFlags::Cbz)
}
if formats == FormatFlags::None && (exists_already || !options.keep_images) {
println!("Already downloaded. Skipping...");
return Ok(DownloadStatus::Skipped);
}
}
let mut toc_page_title_lookup: HashMap<String, String> = HashMap::<String, String>::new();
let mut parse_msg_logged = false;
for element in doc.select(&Selector::parse("div.toc_entry").to_result()?) {
if !parse_msg_logged {
println!("Parsing table of contents...");
parse_msg_logged = true;
}
let mut bookmark_name = String::new();
element.text().for_each(|x| bookmark_name += x);
if let Some(bookmark_url) = element
.select(&Selector::parse("a").to_result()?)
.next()
.and_then(|x| x.attr("href"))
{
if let Some(x) = Url::try_from(bookmark_url)
.to_result()?
.query_pairs()
.find(|x| x.0 == "pg")
{
toc_page_title_lookup.insert(x.1.to_string(), bookmark_name);
}
}
}
let mut res = reqwest::blocking::get(get_json_url(&id, "1", "1")).to_result()?;
let mut body = String::new();
res.read_to_string(&mut body)?;
let issue: IssueJson = serde_json::from_str(&body).to_result()?;
let mut page_number_lookup = HashMap::<String, usize>::new();
let mut pages_to_download = VecDeque::<String>::new();
let mut first_page = "1".to_string();
let mut i_page = 1;
for page in issue.page {
if let None = page.src {
page_number_lookup.insert(page.pid.clone(), i_page);
pages_to_download.push_back(page.pid.clone());
if i_page == 1 {
first_page = page.pid;
}
i_page += 1;
}
}
if options.skip_download {
return Ok(DownloadStatus::Complete(meta));
}
if !exists_already {
std::fs::create_dir_all(&issue_pics_dir)?
}
println!("Downloading images...");
let mut toc = TableOfContents::new();
let mut pages_downloaded = HashSet::<String>::new();
while !pages_to_download.is_empty() {
let page_id = pages_to_download.pop_front().unwrap();
if pages_downloaded.contains(&page_id) {
continue;
}
let mut res =
reqwest::blocking::get(get_json_url(&id, &first_page, &page_id)).to_result()?;
let mut body = String::new();
res.read_to_string(&mut body)?;
let issue: IssueJson = serde_json::from_str(&body).to_result()?;
for page in issue.page {
if let Some(src) = page.src {
if pages_downloaded.contains(&page.pid) {
continue;
}
let mut res = reqwest::blocking::get(src + "&w=10000").to_result()?;
let mut ext = "jpg";
for (name, value) in res.headers() {
if name.as_str() == "content-type" {
ext = value.to_str().to_result()?;
let mut start = 0;
if let Some(x) = ext.find("/") {
start = x + 1
}
ext = &ext[start..];
if ext == "jpeg" {
ext = "jpg"
}
break;
}
}
let mut p = 0;
let page_number = page_number_lookup.get(&page.pid).unwrap_or_else(|| {
p = i_page;
i_page += 1;
&p
});
let filename = std::format!(
"{0}-{1}.{2}",
std::format!("{:0>5}", page_number),
page.pid,
ext
);
if let Ok(mut file) =
std::fs::File::create_new(std::format!("{issue_pics_dir}/{filename}"))
{
res.copy_to(&mut file).to_result()?;
}
if let Some(title) = toc_page_title_lookup.get(&page.pid) {
toc.add_page(title, &filename);
}
pages_downloaded.insert(page.pid);
}
}
}
if formats.contains(FormatFlags::Pdf) {
println!("Generating PDF...");
create_pdf_with_toc(&issue_pics_dir, &filename_pdf, &toc)?;
}
if formats.contains(FormatFlags::Cbz) {
println!("Generating CBZ...");
create_cbz(&issue_pics_dir, &filename_cbz)?;
}
if !(options.keep_images || exists_already) {
std::fs::remove_dir_all(&issue_pics_dir)?;
}
options.already_downloaded.insert(id.to_string());
if let Some(archive) = options.archive_file.as_ref() {
if let Ok(mut file) = OpenOptions::new().append(true).create(true).open(archive) {
if let Err(e) = file.write(std::format!("{id}\n").as_bytes()) {
eprintln!("Couldn't write to file: {}", e);
}
}
}
Ok(DownloadStatus::Complete(meta))
}
/// Downloads every issue linked from the period page at `url`.
///
/// A failure on a single issue is reported on stderr and does not stop the
/// remaining issues.
///
/// # Errors
/// Returns an error only when the list of issue URLs cannot be obtained.
pub fn download_period(url: &str, dest: &str, options: &mut ScraperOptions) -> io::Result<()> {
    let issue_urls = get_issue_urls_in_period(url)?;
    for issue_url in &issue_urls {
        match download_issue(issue_url, dest, options) {
            Ok(_) => {}
            Err(err) => eprintln!("Error downloading issue {issue_url}: {}", err),
        }
    }
    Ok(())
}
/// Downloads every issue of every period linked from the page at `url`.
///
/// A failure on a single period is reported on stderr and does not stop the
/// remaining periods.
///
/// # Errors
/// Returns an error only when the list of period URLs cannot be obtained.
pub fn download_all(url: &str, dest: &str, options: &mut ScraperOptions) -> io::Result<()> {
    let period_urls = get_period_urls(url)?;
    for period_url in &period_urls {
        match download_period(period_url, dest, options) {
            Ok(()) => {}
            Err(err) => eprintln!("Error downloading period {period_url}: {}", err),
        }
    }
    Ok(())
}
/// Collects the `href` of every link inside `#period_selector` on the page at
/// `url`. A blank `href` is replaced by `url` itself.
///
/// # Errors
/// Returns an error when the page cannot be fetched or read.
pub fn get_period_urls(url: &str) -> io::Result<Vec<String>> {
    let mut response = reqwest::blocking::get(url).to_result()?;
    let mut html = String::new();
    response.read_to_string(&mut html)?;

    let document = Html::parse_document(&html);
    let period_links = Selector::parse("#period_selector a").to_result()?;
    let urls = document
        .select(&period_links)
        .filter_map(|link| link.attr("href"))
        .map(|href| {
            if href.trim().is_empty() {
                url.to_string()
            } else {
                href.to_string()
            }
        })
        .collect();
    Ok(urls)
}
/// Collects the `href` of the first-child link of every
/// `div.allissues_gallerycell` on the period page at `url`.
///
/// # Errors
/// Returns an error when the page cannot be fetched or read.
pub fn get_issue_urls_in_period(url: &str) -> io::Result<Vec<String>> {
    let mut response = reqwest::blocking::get(url).to_result()?;
    let mut html = String::new();
    response.read_to_string(&mut html)?;

    let document = Html::parse_document(&html);
    let issue_links = Selector::parse("div.allissues_gallerycell a:first-child").to_result()?;
    let urls = document
        .select(&issue_links)
        .filter_map(|link| link.attr("href"))
        .map(str::to_string)
        .collect();
    Ok(urls)
}
#[cfg(test)]
mod tests {
    use super::*;

    const ID: &str = "FAKE_ID";
    const ARGS: &str = "a=aa&b=bb&c=1";

    /// Runs `download_issue` for `id` with `skip_download` enabled, so only the
    /// metadata is scraped. The request goes to the live site.
    fn fetch_metadata(id: &str) -> io::Result<DownloadStatus> {
        let url = format!("https://books.google.com/books?id={id}");
        let mut options = ScraperOptions {
            skip_download: true,
            ..ScraperOptions::default()
        };
        download_issue(&url, ".", &mut options)
    }

    #[test]
    fn old_url_parsing() {
        let url = format!("https://books.google.com/books?id={ID}&{ARGS}");
        let parsed = id_from_url(&url).unwrap();
        assert_eq!(parsed.as_str(), ID);
    }

    #[test]
    fn new_url_parsing() {
        let url = format!("https://www.google.com/books/edition/_/{ID}?{ARGS}");
        let parsed = id_from_url(&url).unwrap();
        assert_eq!(parsed.as_str(), ID);
    }

    #[test]
    fn url_fixing() {
        let expected = format!("https://books.google.com/books?id={ID}");
        assert_eq!(url_from_id(ID), expected);
    }

    #[test]
    fn metadata_parsing_book() {
        let id = "XV8XAAAAYAAJ";
        let description = concat!(
            "A literary classic that wasn't recognized for its merits until decades after its publication, Herman Melville's Moby-Dick",
            " tells the tale of a whaling ship and its crew, who are carried progressively further out to sea by the fiery Captain Ahab.",
            " Obsessed with killing the massive whale, which had previously bitten off Ahab's leg, the seasoned seafarer steers his ship",
            " to confront the creature, while the rest of the shipmates, including the young narrator, Ishmael, and the harpoon expert,",
            " Queequeg, must contend with their increasingly dire journey. The book invariably lands on any short list of the greatest American novels.",
        );
        let expected = BookMetadata {
            id: id.to_string(),
            series_name: "Moby Dick".to_string(),
            publish_date: String::new(),
            volume: String::new(),
            issn: String::new(),
            publisher: "Dana Estes & Company, 1892".to_string(),
            description: description.to_string(),
            book_type: BookType::Book,
            author: "Herman Melville".to_string(),
            length: 545,
            date_digitized: "Mar 20, 2008".to_string(),
            orig_from: "Harvard University".to_string(),
        };
        let status = fetch_metadata(id);
        assert_eq!(status.unwrap(), DownloadStatus::Complete(expected));
    }

    #[test]
    fn magazine_metadata_parsing_magazine() {
        let id = "CFEEAAAAMBAJ";
        let description = concat!(
            "LIFE Magazine is the treasured photographic magazine that chronicled the 20th Century. It now lives on at LIFE.com,",
            " the largest, most amazing collection of professional photography on the internet. Users can browse, search and view",
            " photos of today’s people and events. They have free access to share, print and post images for personal use.",
        );
        let expected = BookMetadata {
            id: id.to_string(),
            series_name: "LIFE".to_string(),
            publish_date: "Oct 3, 1969".to_string(),
            volume: "Vol. 67, No. 14".to_string(),
            issn: "ISSN 0024-3019".to_string(),
            publisher: "Published by Time Inc".to_string(),
            description: description.to_string(),
            book_type: BookType::Issue,
            author: String::new(),
            length: 94,
            date_digitized: String::new(),
            orig_from: String::new(),
        };
        let status = fetch_metadata(id);
        assert_eq!(status.unwrap(), DownloadStatus::Complete(expected));
    }
}