use super::{
chapter,
page,
series,
};
use crate::{
spiders::HttpClient,
Chapter,
Error,
Filter,
Options,
Page,
Result,
Series,
};
use cookie_store::CookieStore;
use std::path::PathBuf;
use url::Url;
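/// Scraper for <https://www.webtoons.com>.
///
/// Wraps a throttled [`HttpClient`] together with the output directory
/// that chapter folders and page images are written under.
///
/// A minimal usage sketch, assuming `options`, `url`, and `filter` come
/// from the caller (the flow mirrors the `crate::Site` impl below):
///
/// ```ignore
/// let site = Site::new(options);
/// let series = site.get_series(&url)?;
/// let chapters = site.get_chapters(&series, filter)?;
/// site.mkdir(&chapters)?;
/// for chapter in &chapters {
///     site.download(&site.get_pages(chapter)?)?;
/// }
/// ```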
pub(crate) struct Site {
spider: HttpClient,
output: PathBuf,
}
impl Site {
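/// Builds the scraper from the parsed command-line [`Options`].
///
/// A `pagGDPR=true` cookie is seeded into the cookie store up front so
/// that webtoons.com serves pages directly instead of its GDPR consent
/// interstitial. Both `expect`s can only fail on compile-time constants,
/// hence the lint allow.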
#[allow(clippy::expect_used)]
pub(crate) fn new(options: Options) -> Self {
let mut store = CookieStore::default();
store
.insert_raw(
&ureq::Cookie::new("pagGDPR", "true"),
&Url::parse("https://www.webtoons.com/").expect("valid URL"),
)
.expect("pagGDPR cookie");
Self {
spider: HttpClient::new(options.delay, options.retry, Some(store)),
output: options.output,
}
}
}
impl crate::Site for Site {
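/// Fetches the series landing page at `url` and scrapes its metadata
/// (title and pagination info) from the HTML.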
fn get_series(&self, url: &Url) -> Result<Series> {
log::info!("scraping series info from {}…", url.as_str());
let html = self.spider.get_html(url)?;
let series = series::scrape_from_html(&html).map_err(|err| {
Error::Scraping(format!(
"failed to scrape serie from {}: {}",
url.as_str(),
err
))
})?;
log::debug!(
"scraped info for series `{}`: {} chapters, {} per page",
series.title,
series.pagination.chapter_count,
series.pagination.page_size
);
Ok(series)
}
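/// Collects the chapter links of `series` that fall inside `filter.range`,
/// returned sorted by chapter ID.
///
/// Chapter lists on webtoons.com are paginated newest-first, so the
/// highest chapter number in the range sits on the earliest page. As an
/// illustration, assuming `get_page` maps a chapter number to its 1-based
/// page in that newest-first order: with 95 chapters and a page size of
/// 10, chapter 60 lands on page 4 and chapter 40 on page 6, so a range of
/// `40..=60` fetches pages 4 through 6.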
fn get_chapters<'a>(
&self,
series: &'a Series,
filter: Filter,
) -> Result<Vec<Chapter<'a>>> {
log::info!("scraping chapter links for series {}…", series.title);
let first_page = series.pagination.get_page(*filter.range.end());
let last_page = series.pagination.get_page(*filter.range.start());
let mut chapters = Vec::new();
for page in first_page..=last_page {
let mut url = series.url.clone();
url.query_pairs_mut().append_pair("page", &page.to_string());
log::info!("extracting chapter from page {}…", page);
let html = self.spider.get_html(&url)?;
chapters.extend(
chapter::scrape_from_html(&html, series).map_err(|err| {
Error::Scraping(format!(
"failed to scrape chapters from {}: {}",
url.as_str(),
err
))
})?,
);
}
log::debug!("found {} chapters", chapters.len());
#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
let mut chapters = chapters
.into_iter()
.filter(|chapter| filter.range.contains(&(chapter.id as u16)))
.collect::<Vec<_>>();
#[allow(clippy::expect_used)]
chapters.sort_by(|a, b| a.partial_cmp(b).expect("abnormal float as ID"));
log::debug!("selected {} chapters", chapters.len());
Ok(chapters)
}
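/// Scrapes the image URLs of every page in `chapter` from its HTML.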
fn get_pages<'a>(&self, chapter: &'a Chapter<'_>) -> Result<Vec<Page<'a>>> {
log::info!("scraping page links for chapter {}…", chapter.id);
let html = self.spider.get_html(&chapter.url)?;
let pages = page::scrape_from_html(&html, chapter).map_err(|err| {
Error::Scraping(format!(
"failed to pages from {}: {}",
chapter.url.as_str(),
err
))
})?;
log::debug!("found {} pages in chapter {}", pages.len(), chapter.id);
Ok(pages)
}
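/// Creates the output directory for each chapter before any download.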
fn mkdir(&self, chapters: &[Chapter<'_>]) -> Result<()> {
for chapter in chapters {
let path = chapter.path(&self.output);
crate::fs::mkdir_p(&path)?;
}
Ok(())
}
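/// Downloads every page image under the output directory, skipping files
/// that already exist. Each image is fetched into a reused buffer and
/// written atomically, so an interrupted run never leaves a truncated
/// file behind.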
fn download(&self, pages: &[Page<'_>]) -> Result<()> {
// an empty selection would otherwise panic on the indexing below
if pages.is_empty() {
return Ok(());
}
log::info!(
"downloading {} pages for chapter {}…",
pages.len(),
pages[0].chapter.id,
);
let mut bytes: Vec<u8> = Vec::new();
for page in pages {
let path = page.path(&self.output);
if path.exists() {
log::debug!("{} already exists, skip", path.display());
continue;
}
log::info!("downloading {}…", path.display());
self.spider
.get_image(&page.main, &page.chapter.url, &mut bytes)?;
crate::fs::atomic_save(&path, &bytes)?;
bytes.clear();
}
Ok(())
}
}