use bitflags::bitflags;
use scraper::{ElementRef, Html, Selector};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet, VecDeque};
use std::fs::OpenOptions;
use std::io::{self};
use std::io::{Read, Write};
use url::Url;
use crate::cbz::create_cbz;
use crate::pdf::{create_pdf_with_toc, TableOfContents};
/// Runtime configuration for the issue downloader.
pub struct ScraperOptions {
    /// Keep the per-page image directory after output generation instead of deleting it.
    pub keep_images: bool,
    /// Which output formats (PDF and/or CBZ) to generate for each issue.
    pub formats: FormatFlags,
    /// Issue ids that have already been downloaded; these issues are skipped.
    pub already_downloaded: HashSet<String>,
    /// Optional path of an archive file to which finished issue ids are appended, one per line.
    pub archive_file: Option<String>,
}
impl Default for ScraperOptions {
fn default() -> Self {
Self {
keep_images: false,
formats: FormatFlags::Pdf,
already_downloaded: HashSet::new(),
archive_file: None,
}
}
}
bitflags! {
    /// Bit set of output formats to produce for a downloaded issue.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct FormatFlags:u32 {
        /// Produce no output files.
        const None = 0b000;
        /// Generate a PDF (with table-of-contents bookmarks).
        const Pdf = 0b001;
        /// Generate a CBZ comic-book archive.
        const Cbz = 0b010;
        /// Generate both PDF and CBZ.
        const All = 0b011;
    }
}
/// Metadata scraped from an issue's summary page.
///
/// All fields except `id` start empty and are filled in by [`IssueMetadata::parse`];
/// a field stays empty if the corresponding element/text is absent from the page.
pub struct IssueMetadata {
    // Issue id taken from the page URL's `id` query parameter.
    pub id: String,
    // Series title scraped from the `.booktitle` element.
    pub series_name: String,
    // Publish date — first text node of the `#metadata` element.
    pub publish_date: String,
    // Volume — third text node of `#metadata` (index 2).
    pub volume: String,
    // ISSN — fourth text node of `#metadata` (index 3).
    pub issn: String,
    // Publisher — fifth text node of `#metadata` (index 4).
    pub publisher: String,
    // Synopsis text scraped from `#synopsistext`.
    pub description: String,
}
/// One page entry from the `jscmd=click3` JSON response.
#[derive(Serialize, Deserialize)]
struct PageJson {
    // Page identifier (used as the `pg` query parameter when requesting pages).
    pid: String,
    // Image URL for the page; present only for pages the server chose to
    // return a source for in this response. Fetched with `&w=10000` appended —
    // presumably a requested image width; TODO confirm against the endpoint.
    src: Option<String>,
}
/// Top-level shape of the `jscmd=click3` JSON response: the list of pages.
#[derive(Serialize, Deserialize)]
struct IssueJson {
    page: Vec<PageJson>,
}
impl IssueMetadata {
    /// Creates an empty metadata record for the issue with the given id.
    fn new(id: &str) -> IssueMetadata {
        IssueMetadata {
            id: id.to_owned(),
            series_name: String::new(),
            publish_date: String::new(),
            volume: String::new(),
            issn: String::new(),
            publisher: String::new(),
            description: String::new(),
        }
    }

    /// Fills the metadata fields by scraping the issue's summary element.
    ///
    /// Each field is taken from the first matching element; fields whose
    /// source element or text node is missing are left unchanged (empty).
    fn parse(&mut self, element: &ElementRef) {
        // Series title: first text node of the first `.booktitle` element.
        let title_sel = Selector::parse(".booktitle").unwrap();
        if let Some(node) = element.select(&title_sel).next() {
            if let Some(text) = node.text().next() {
                self.series_name = text.to_string();
            }
        }

        // Description: first text node of the first `#synopsistext` element.
        let synopsis_sel = Selector::parse("#synopsistext").unwrap();
        if let Some(node) = element.select(&synopsis_sel).next() {
            if let Some(text) = node.text().next() {
                self.description = text.to_string();
            }
        }

        // Remaining fields come from fixed positions among the text nodes of
        // the first `#metadata` element (index 1 is skipped).
        let metadata_sel = Selector::parse("#metadata").unwrap();
        if let Some(node) = element.select(&metadata_sel).next() {
            for (index, text) in node.text().enumerate() {
                match index {
                    0 => self.publish_date = text.to_string(),
                    2 => self.volume = text.to_string(),
                    3 => self.issn = text.to_string(),
                    4 => self.publisher = text.to_string(),
                    _ => (),
                }
            }
        }
    }
}
/// Downloads a single issue from `url` into `dest`, generating the output
/// formats requested in `options`.
///
/// The issue is identified by the `id` query parameter of `url`. Its summary
/// page is scraped for metadata and a table of contents, then every page
/// image is fetched through the `jscmd=click3` JSON endpoint into
/// `dest/<series>/<date> [<id>]/`, and finally a PDF and/or CBZ is assembled
/// next to that directory. Issues already recorded in
/// `options.already_downloaded` (or whose output files all exist) are
/// skipped; finished ids are added to that set and appended to the archive
/// file, if one is configured.
///
/// # Errors
/// Propagates I/O errors from reading HTTP bodies, creating directories, and
/// writing output files. Network and JSON decoding failures currently panic
/// (`unwrap`), matching the rest of this module.
pub fn download_issue(url: &str, dest: &str, options: &mut ScraperOptions) -> io::Result<()> {
    let mut url_obj = Url::try_from(url).unwrap();

    // The issue id travels in the `id` query parameter.
    let mut id = String::new();
    for (key, val) in url_obj.query_pairs() {
        if key == "id" {
            id = val.to_string();
            break;
        }
    }

    if options.already_downloaded.contains(&id) {
        println!("Skipping already downloaded book: {id}...");
        return Ok(());
    }
    println!("Identifying book: {id}...");

    // Fetch the summary page and scrape metadata out of it.
    let mut res = reqwest::blocking::get(url).unwrap();
    let mut body = String::new();
    res.read_to_string(&mut body)?;
    let doc = Html::parse_document(&body);

    let mut issue_meta = IssueMetadata::new(&id);
    let selector = Selector::parse("#summary_content_table").unwrap();
    if let Some(element) = doc.select(&selector).next() {
        issue_meta.parse(&element);
    }

    let issue_combined_id = std::format!("{0} [{1}]", issue_meta.publish_date, issue_meta.id);
    let series_dir = std::format!("{dest}/{0}", issue_meta.series_name);
    let issue_pics_dir = std::format!("{series_dir}/{issue_combined_id}");
    println!("Found: {0} - {issue_combined_id}", issue_meta.series_name);

    let exists_already = std::path::Path::new(&issue_pics_dir).exists();
    if !exists_already {
        std::fs::create_dir_all(&issue_pics_dir)?
    }

    // Drop formats whose output file already exists. If nothing is left to
    // generate and the images aren't needed (or are already on disk), the
    // whole issue is done.
    let mut formats = options.formats; // FormatFlags is Copy; no clone needed
    let filename_pdf = std::format!("{series_dir}/{issue_combined_id}.pdf");
    if std::path::Path::new(&filename_pdf).exists() {
        formats.remove(FormatFlags::Pdf)
    }
    let filename_cbz = std::format!("{series_dir}/{issue_combined_id}.cbz");
    if std::path::Path::new(&filename_cbz).exists() {
        formats.remove(FormatFlags::Cbz)
    }
    if formats == FormatFlags::None && (exists_already || !options.keep_images) {
        println!("Already downloaded. Skipping...");
        return Ok(());
    }

    // The TOC table alternates rows: an entry row (title in a <span>, target
    // page id in the <a href>'s `pg` parameter) followed by a spacer row on
    // which the collected pair is committed. Build page-id -> title lookup
    // for the PDF bookmarks.
    let mut toc_page_title_lookup = HashMap::<String, String>::new();
    let selector = Selector::parse("#toc tr").unwrap();
    let mut i = 0;
    let mut bookmark_text = String::new();
    let mut bookmark_page_id = String::new();
    for element in doc.select(&selector) {
        if i % 2 == 0 {
            for span in element.select(&Selector::parse("span").unwrap()) {
                for str in span.text() {
                    bookmark_text += str;
                }
                break;
            }
            for link in element.select(&Selector::parse("a").unwrap()) {
                if let Some(href) = link.attr("href") {
                    let link_url_obj = Url::try_from(href).unwrap();
                    for (key, val) in link_url_obj.query_pairs() {
                        if key == "pg" {
                            bookmark_page_id = val.to_string();
                            break;
                        }
                    }
                }
                break;
            }
        } else {
            toc_page_title_lookup.insert(bookmark_page_id, bookmark_text);
            bookmark_page_id = String::new();
            bookmark_text = String::new();
        }
        i += 1;
    }

    // First click3 request: enumerate all pages of the issue. Entries without
    // a `src` still need downloading; assign them 1-based page numbers in
    // listing order.
    let json_query_str = std::format!("id={id}&lpg=1&pg=1&jscmd=click3");
    url_obj.set_query(Some(&json_query_str));
    let mut res = reqwest::blocking::get(url_obj.to_string()).unwrap();
    let mut body = String::new();
    res.read_to_string(&mut body)?;
    let issue: IssueJson = serde_json::from_str(&body).unwrap();

    let mut page_number_lookup = HashMap::<String, usize>::new();
    let mut pages_to_download = VecDeque::<String>::new();
    let mut first_page = "1".to_string();
    let mut i = 1;
    for page in issue.page {
        if page.src.is_none() {
            page_number_lookup.insert(page.pid.clone(), i);
            pages_to_download.push_back(page.pid.clone());
            if i == 1 {
                first_page = page.pid;
            }
            i += 1;
        }
    }

    // Request each pending page; every response may carry image sources for
    // several pages, so save all of them and skip ids already handled.
    let mut toc = TableOfContents::new();
    let mut pages_downloaded = HashSet::<String>::new();
    while !pages_to_download.is_empty() {
        let page_id = pages_to_download.pop_front().unwrap();
        if pages_downloaded.contains(&page_id) {
            continue;
        }
        let json_query_str = std::format!("id={id}&lpg={first_page}&pg={page_id}&jscmd=click3");
        url_obj.set_query(Some(&json_query_str));
        let mut res = reqwest::blocking::get(url_obj.to_string()).unwrap();
        let mut body = String::new();
        res.read_to_string(&mut body)?;
        let issue: IssueJson = serde_json::from_str(&body).unwrap();
        for page in issue.page {
            if let Some(src) = page.src {
                let mut res = reqwest::blocking::get(src + "&w=10000").unwrap();
                // Derive the file extension from the response content type,
                // defaulting to jpg; normalize "jpeg" -> "jpg".
                let mut ext = "jpg";
                for (name, value) in res.headers() {
                    if name.as_str() == "content-type" {
                        ext = value.to_str().unwrap();
                        let mut start = 0;
                        if let Some(x) = ext.find("/") {
                            start = x + 1
                        }
                        ext = &ext[start..];
                        if ext == "jpeg" {
                            ext = "jpg"
                        }
                        break;
                    }
                }
                // Zero-padded page number keeps files in reading order.
                let filename = std::format!(
                    "{0}-{1}.{2}",
                    std::format!("{:0>3}", page_number_lookup.get(&page.pid).unwrap()),
                    page.pid,
                    ext
                );
                // BUG FIX: previously every image was written to a literal
                // placeholder path, so pages overwrote a single file. Save to
                // the per-page filename; create_new keeps an existing file
                // untouched (resume support).
                if let Ok(mut file) =
                    std::fs::File::create_new(std::format!("{issue_pics_dir}/{filename}"))
                {
                    res.copy_to(&mut file).unwrap();
                }
                if let Some(title) = toc_page_title_lookup.get(&page.pid) {
                    toc.add_page(title, &filename);
                }
                pages_downloaded.insert(page.pid);
            }
        }
    }

    if formats.contains(FormatFlags::Pdf) {
        println!("Generating PDF...");
        create_pdf_with_toc(&issue_pics_dir, &filename_pdf, &toc)?;
    }
    if formats.contains(FormatFlags::Cbz) {
        println!("Generating CBZ...");
        create_cbz(&issue_pics_dir, &filename_cbz)?;
    }
    if !(options.keep_images || exists_already) {
        std::fs::remove_dir_all(&issue_pics_dir)?;
    }

    // Record the finished issue so reruns skip it.
    options.already_downloaded.insert(id.clone());
    if let Some(archive) = options.archive_file.as_ref() {
        if let Ok(mut file) = OpenOptions::new().append(true).create(true).open(archive) {
            // write_all instead of write: a short write must not truncate the id.
            if let Err(e) = file.write_all(std::format!("{id}\n").as_bytes()) {
                eprintln!("Couldn't write to file: {}", e);
            }
        }
    }
    Ok(())
}
/// Downloads every issue listed on a single period page into `dest`.
///
/// Stops at (and returns) the first error produced by an individual issue.
pub fn download_period(url: &str, dest: &str, options: &mut ScraperOptions) -> io::Result<()> {
    get_issue_urls_in_period(url)?
        .iter()
        .try_for_each(|issue_url| download_issue(issue_url, dest, options))
}
/// Downloads every issue of every period reachable from the overview page at
/// `url` into `dest`.
///
/// Stops at (and returns) the first error produced by any period.
pub fn download_all(url: &str, dest: &str, options: &mut ScraperOptions) -> io::Result<()> {
    get_period_urls(url)?
        .iter()
        .try_for_each(|period_url| download_period(period_url, dest, options))
}
/// Collects the URL of every period linked from the `#period_selector`
/// element of the overview page at `url`.
///
/// An anchor with an empty (whitespace-only) `href` denotes the currently
/// shown period, so the input `url` itself is substituted for it.
pub fn get_period_urls(url: &str) -> io::Result<Vec<String>> {
    let mut body = String::new();
    reqwest::blocking::get(url).unwrap().read_to_string(&mut body)?;
    let doc = Html::parse_document(&body);
    let selector = Selector::parse("#period_selector a").unwrap();
    let urls = doc
        .select(&selector)
        .filter_map(|element| element.attr("href"))
        .map(|href| {
            if href.trim().is_empty() {
                url.to_string()
            } else {
                href.to_string()
            }
        })
        .collect();
    Ok(urls)
}
/// Collects the URL of every issue shown on a period page: the first anchor
/// inside each `div.allissues_gallerycell`.
pub fn get_issue_urls_in_period(url: &str) -> io::Result<Vec<String>> {
    let mut body = String::new();
    reqwest::blocking::get(url).unwrap().read_to_string(&mut body)?;
    let doc = Html::parse_document(&body);
    let selector = Selector::parse("div.allissues_gallerycell a:first-child").unwrap();
    let urls = doc
        .select(&selector)
        .filter_map(|element| element.attr("href"))
        .map(str::to_string)
        .collect();
    Ok(urls)
}