use anyhow::Context;
use std::{
path::Path,
sync::{Arc, Mutex},
};
use scraper::{Html, Selector};
use tokio::fs;
use url::Url;
use crate::request;
/// Downloader state for one target URL (a paginated listing or a single post).
#[derive(Clone, Debug)]
pub struct Everia {
/// Target URL as parsed by `new`; `download` inspects it to choose between
/// crawling a listing and fetching a single post.
url: Url,
/// Directory under which `create_folder_from_url` creates one sub-folder per post.
out_folder: String,
/// Number of worker tasks spawned by `download` (`new` sets it to 8).
task: u32,
/// Retry budget given to each image download in `download_posts` (`new` sets it to 30).
retry: u32,
}
impl Everia {
pub fn parse_name(url: &str) -> String {
let mut url: Vec<&str> = url.split("/").collect();
let file_name = if let Some(fname) = url.pop()
&& !fname.is_empty()
{
fname
} else {
url.pop().unwrap()
};
urlencoding::decode(file_name).unwrap().to_string()
}
/// Builds a downloader for `url`.
///
/// `out_folder` is the directory that per-post folders are created under; when it
/// is `None` the name is derived from `url` with [`Everia::parse_name`]. The
/// instance is configured with 8 worker tasks and a retry budget of 30.
///
/// # Errors
///
/// Returns an error when `url` is rejected by [`Url::parse`].
pub fn new(url: &str, out_folder: Option<String>) -> anyhow::Result<Self> {
    // Validate first so malformed input is reported as an error before a folder
    // name is derived from it.
    let parsed = Url::parse(url).context("invalid url")?;
    let out_folder = out_folder.unwrap_or_else(|| Everia::parse_name(url));
    println!("out_folder {}", out_folder);
    Ok(Self {
        url: parsed,
        out_folder,
        task: 8,
        retry: 30,
    })
}
/// Extracts post links from the HTML of a listing page.
///
/// Only the last element matching `#content` is inspected. Every `<a>` below it
/// that carries a `rel` attribute contributes its `href`; anchors missing either
/// attribute are ignored. The result is always `Ok`; a page without `#content`
/// yields an empty list.
pub fn collect_posts_link(&self, body: &str) -> anyhow::Result<Vec<String>> {
    let page = Html::parse_document(body);
    let content_selector = Selector::parse("#content").unwrap();
    let anchor_selector = Selector::parse("a").unwrap();

    let Some(content) = Iterator::last(page.select(&content_selector)) else {
        return Ok(Vec::new());
    };

    let links = content
        .select(&anchor_selector)
        .filter(|anchor| anchor.attr("rel").is_some())
        .filter_map(|anchor| anchor.attr("href").map(|href| href.to_string()))
        .collect();
    Ok(links)
}
/// Fetches `post_url` and returns the image URLs found in the post.
///
/// The last element matching `div.entry-content, div.mainleft` is searched for
/// `<img>` tags. For each one the `data-lazy-src` attribute is preferred, then
/// `data-original`; images with neither attribute are skipped.
///
/// # Errors
///
/// Propagates failures from building the client, sending the request, or
/// reading the response body.
pub async fn collect_image_link_from_post(
    &self,
    post_url: &str,
) -> anyhow::Result<Vec<String>> {
    let http = request::client()?;
    let reply = http.get(post_url).send().await?;
    let html = reply.text().await?;

    // Parsing happens only after the last `.await` of this function.
    let page = Html::parse_document(&html);
    let container_selector = Selector::parse("div.entry-content, div.mainleft").unwrap();

    let image_links = match Iterator::last(page.select(&container_selector)) {
        Some(container) => {
            let img_selector = Selector::parse("img").unwrap();
            container
                .select(&img_selector)
                .filter_map(|img| {
                    img.attr("data-lazy-src")
                        .or_else(|| img.attr("data-original"))
                        .map(|link| link.to_string())
                })
                .collect()
        }
        None => Vec::new(),
    };
    Ok(image_links)
}
pub async fn create_folder_from_url(&self, post_url: &str) -> Option<String> {
let mut folder_name: Vec<&str> = post_url.split("/").collect();
let folder_name = if let Some(name) = folder_name.pop()
&& !name.is_empty()
{
name
} else {
folder_name.pop().unwrap_or("None")
};
let folder_name = urlencoding::decode(folder_name).unwrap().to_string();
let path = format!("{}/{}/", self.out_folder, folder_name);
if Path::new(&path).exists() {
None
} else {
let _ = fs::create_dir_all(&path).await;
Some(path)
}
}
pub async fn download_posts(&self, post_url: &str) -> anyhow::Result<()> {
let out_folder = if let Some(out) = self.create_folder_from_url(post_url).await {
out
} else {
println!("skip {}", post_url);
return Ok(());
};
let image_links = self.collect_image_link_from_post(post_url).await?;
let mut tasks = Vec::new();
for img in image_links {
let out_folder = out_folder.clone();
let img = img.clone();
let mut retry = self.retry;
tasks.push(tokio::spawn(async move {
let img_name = img.split("/").last().unwrap();
'request: loop {
if let Ok(client) = request::client()
&& let Ok(response) = client.get(&img).send().await
&& response.status() == 200
&& let Ok(content) = response.bytes().await
{
let _ = fs::write(format!("{}/{}", out_folder, img_name), content).await;
} else if retry > 0 {
retry -= 1;
continue 'request;
}
break 'request;
}
}));
}
for task in tasks {
let _ = task.await;
}
Ok(())
}
/// Fetches one listing page and returns the post links found on it.
///
/// # Errors
///
/// Fails when the client cannot be built, when the request or the body read
/// fails, or when the server answers with anything other than a 2xx status
/// (redirects, client errors and server errors alike).
pub async fn collect_posts_per_page(&self, url: &str) -> anyhow::Result<Vec<String>> {
    let client = request::client()?;
    let response = client.get(url).send().await?;
    // `collect_posts` stops paginating on the first error, so an error page
    // (e.g. a 404 past the last page) must not be parsed as a valid listing.
    if !response.status().is_success() {
        return Err(anyhow::anyhow!("page is invalid"));
    }
    let body = response.text().await?;
    self.collect_posts_link(&body)
}
/// Walks the paginated listing rooted at `self.url` and gathers every post link.
///
/// Page 1 is `self.url` itself; page `n` is `<self.url>/page/<n>/`. Walking
/// stops at the first page that fails to load or that contains no post links.
pub async fn collect_posts(&self) -> Vec<String> {
    let base = self.url.as_str();
    // Without a separator, a base URL lacking a trailing slash would produce `.../foopage/2/`.
    let separator = if base.ends_with('/') { "" } else { "/" };

    let mut all_post = Vec::new();
    for page in 1u32.. {
        let url = if page == 1 {
            base.to_string()
        } else {
            format!("{}{}page/{}/", base, separator, page)
        };
        print!("fetching {} ", url);
        match self.collect_posts_per_page(&url).await {
            // An empty page means we ran past the end; stopping here also avoids
            // looping forever when out-of-range pages are served successfully.
            Ok(posts) if posts.is_empty() => break,
            Ok(posts) => {
                println!("- PASS");
                all_post.extend(posts);
            }
            Err(_) => break,
        }
    }
    println!("- Done");
    all_post
}
pub async fn download(&self) {
let posts = if self.url.as_str().contains("tag") {
self.collect_posts().await
} else {
vec![self.url.clone().to_string()]
};
let posts = Arc::new(Mutex::new(posts));
let mut threads = Vec::new();
for _ in 0..self.task {
let posts = posts.clone();
let self_instance = self.clone();
threads.push(tokio::task::spawn(async move {
loop {
let (post, post_len) = {
if let Ok(mut posts) = posts.lock() {
(posts.pop(), posts.len())
} else {
continue;
}
};
if let Some(post) = post {
println!("[{}] downloading {}", post_len, post);
let _ = self_instance.download_posts(&post).await;
println!("[{}] - Done", post_len);
} else {
break;
}
}
}));
}
for th in threads {
let _ = th.await;
}
}
}