use self::image_data::ImageDataBase64;
use self::pair::Pair;
use self::request::ImageRequest;
use crate::download_progress::DownloadProgress;
use crate::error::ImageDownloadError;
use crate::util::Util;
use crate::{FullTextParser, constants};
use base64::Engine;
use futures::channel::mpsc::{self, Sender};
use futures::{SinkExt, StreamExt};
use image::ImageFormat;
use libxml::tree::{Node, SaveOptions};
use libxml::xpath::Context;
use reqwest::Client;
use std::io::Cursor;
mod image_data;
mod pair;
mod request;
pub struct ImageDownloader {
max_size: (u32, u32),
}
impl ImageDownloader {
pub fn new(max_size: (u32, u32)) -> Self {
ImageDownloader { max_size }
}
pub async fn single_from_url(
url: &str,
client: &Client,
mut progress: Option<Sender<DownloadProgress>>,
) -> Result<Vec<u8>, ImageDownloadError> {
let response = client.get(url).send().await?;
let content_type = Util::get_content_type(&response);
let content_length = Util::get_content_length(&response);
if content_type.is_err()
&& let Ok(content_length) = content_length
&& content_length > constants::UNKNOWN_CONTENT_SIZE_LIMIT
{
return Err(ImageDownloadError::ContentType);
}
let content_type = content_type?;
let content_length = content_length.unwrap_or(0);
if !content_type.contains("image") {
return Err(ImageDownloadError::ContentType);
}
let mut stream = response.bytes_stream();
let mut downloaded_bytes = 0;
let mut result = Vec::with_capacity(content_length);
while let Some(item) = stream.next().await {
let chunk = item?;
downloaded_bytes += chunk.len();
if let Some(sender) = progress.as_mut() {
_ = sender
.send(DownloadProgress {
total_size: content_length,
downloaded: downloaded_bytes,
})
.await;
}
for byte in chunk {
result.push(byte);
}
}
Ok(result)
}
pub async fn download_images_from_string(
&self,
html: &str,
client: &Client,
mut progress: Option<Sender<DownloadProgress>>,
) -> Result<String, ImageDownloadError> {
let image_urls = Self::harvest_image_urls_from_html(html)?;
let mut image_requests = Vec::new();
for image_url in image_urls {
let client = client.clone();
let future = async move {
let request = ImageRequest::new(image_url.value, &client).await;
let parent_request = if let Some(parent_url) = image_url.parent_value {
ImageRequest::new(parent_url, &client).await.ok()
} else {
None
};
if let Ok(request) = request {
Some(Pair {
value: request,
parent_value: parent_request,
})
} else {
None
}
};
image_requests.push(future);
}
let res = futures::future::join_all(image_requests)
.await
.into_iter()
.flatten()
.collect::<Vec<_>>();
let total_size = res
.iter()
.map(|req_pair| {
req_pair.value.content_length()
+ req_pair
.parent_value
.as_ref()
.map(|p| p.content_length())
.unwrap_or(0)
})
.sum::<usize>();
let (tx, mut rx) = mpsc::channel::<usize>(2);
let mut download_futures = Vec::new();
for request in res {
download_futures.push(self.download_image(request, tx.clone()));
}
tokio::spawn(async move {
let mut received = 0_usize;
while let Some(i) = rx.next().await {
received += i;
if let Some(progress) = progress.as_mut() {
_ = progress
.send(DownloadProgress {
total_size,
downloaded: received,
})
.await;
}
}
});
let downloaded_images = futures::future::join_all(download_futures)
.await
.into_iter()
.flatten()
.collect::<Vec<_>>();
Self::replace_downloaded_images(html, downloaded_images)
}
fn replace_downloaded_images(
html: &str,
downloaded_images: Vec<Pair<ImageDataBase64>>,
) -> Result<String, ImageDownloadError> {
let doc = FullTextParser::parse_html_string_patched(html)
.map_err(|_| ImageDownloadError::HtmlParse)?;
let xpath_ctx = Context::new(&doc).map_err(|()| {
tracing::error!("Failed to create xpath context for document");
ImageDownloadError::HtmlParse
})?;
for downloaded_image_pair in downloaded_images {
let xpath = format!("//img[@src='{}']", &downloaded_image_pair.value.url);
let node = Util::evaluate_xpath(&xpath_ctx, &xpath, false)
.expect("doesn't throw")
.into_iter()
.next();
if let Some(mut node) = node {
if node
.set_property("src", &downloaded_image_pair.value.data)
.is_err()
{
continue;
}
if let Some(parent_data) = downloaded_image_pair.parent_value {
_ = node.set_property("big-src", &parent_data.data)
}
}
}
let options = SaveOptions {
format: false,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
Ok(doc.to_string_with_options(options))
}
fn harvest_image_urls_from_html(html: &str) -> Result<Vec<Pair<String>>, ImageDownloadError> {
let doc = FullTextParser::parse_html_string_patched(html)
.map_err(|_| ImageDownloadError::HtmlParse)?;
let xpath_ctx = Context::new(&doc).map_err(|()| {
tracing::error!("Failed to create xpath context for document");
ImageDownloadError::HtmlParse
})?;
let xpath = "//img";
let node_vec = Util::evaluate_xpath(&xpath_ctx, xpath, false)
.map_err(|_| ImageDownloadError::HtmlParse)?;
let mut image_urls = Vec::new();
for node in node_vec {
if let Ok(url) = Self::harvest_image_urls_from_node(node) {
image_urls.push(url);
}
}
Ok(image_urls)
}
fn harvest_image_urls_from_node(node: Node) -> Result<Pair<String>, ImageDownloadError> {
let src = match node.get_property("src") {
Some(src) => {
if src.starts_with("data:") {
return Err(ImageDownloadError::Unknown);
} else {
src
}
}
None => {
return Err(ImageDownloadError::Unknown);
}
};
let parent_url = Self::check_image_parent(&node).ok();
let image_url = Pair {
value: src,
parent_value: parent_url,
};
Ok(image_url)
}
async fn download_image(
&self,
request: Pair<ImageRequest>,
mut tx: Sender<usize>,
) -> Result<Pair<ImageDataBase64>, ImageDownloadError> {
let content_type = request.value.content_type().to_owned();
let scale_image = content_type != "image/svg+xml" && content_type != "image/gif";
let mut image = request.value.download(&mut tx).await?;
let mut parent_image = None;
if let Some(parent_request) = request.parent_value {
parent_image = parent_request.download(&mut tx).await.ok();
}
if scale_image && let Some(resized_image) = Self::scale_image(&image.data, self.max_size) {
if parent_image.is_none() {
parent_image = Some(image.clone());
}
image.data = resized_image;
}
let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image.data);
let image_string = format!("data:{content_type};base64,{image_base64}");
let image_data_base64 = ImageDataBase64 {
url: image.url,
data: image_string,
};
let parent_image_data_base64 = if let Some(parent_image) = parent_image {
let parent_image_base64 =
base64::engine::general_purpose::STANDARD.encode(&parent_image.data);
let parent_image_string = format!(
"data:{};base64,{}",
parent_image.content_type, parent_image_base64
);
Some(ImageDataBase64 {
url: parent_image.url,
data: parent_image_string,
})
} else {
None
};
Ok(Pair {
value: image_data_base64,
parent_value: parent_image_data_base64,
})
}
fn scale_image(image_buffer: &[u8], max_dimensions: (u32, u32)) -> Option<Vec<u8>> {
let mut image = match image::load_from_memory(image_buffer) {
Err(error) => {
tracing::error!(%error, "Failed to open image to resize");
return None;
}
Ok(image) => image,
};
let dimensions = (image.width(), image.height());
if dimensions.0 > max_dimensions.0 || dimensions.1 > max_dimensions.1 {
image = image.resize(
max_dimensions.0,
max_dimensions.1,
image::imageops::FilterType::Lanczos3,
);
let mut resized_buf: Vec<u8> = Vec::new();
if let Err(error) = image.write_to(&mut Cursor::new(&mut resized_buf), ImageFormat::Png)
{
tracing::error!(%error, "Failed to save resized image");
return None;
}
Some(resized_buf)
} else {
None
}
}
fn check_image_parent(node: &Node) -> Result<String, ImageDownloadError> {
let parent = match node.get_parent() {
Some(parent) => parent,
None => {
tracing::debug!("No parent node");
return Err(ImageDownloadError::ParentDownload);
}
};
if parent.get_name().to_lowercase() != "a" {
tracing::debug!("parent is not an <a> node");
return Err(ImageDownloadError::ParentDownload);
}
let href = match parent.get_property("href") {
Some(href) => href,
None => {
tracing::debug!("Parent doesn't have href prop");
return Err(ImageDownloadError::ParentDownload);
}
};
Ok(href)
}
}
#[cfg(test)]
mod tests {
use super::*;
use reqwest::Client;
use std::fs;
#[tokio::test]
#[ignore = "downloads content from the web"]
async fn fedora31() {
let image_dowloader = ImageDownloader::new((2048, 2048));
let html = fs::read_to_string(r"./resources/tests/images/planet_gnome/source.html")
.expect("Failed to read HTML");
let result = image_dowloader
.download_images_from_string(&html, &Client::new(), None)
.await
.expect("Failed to downalod images");
let expected = fs::read_to_string(r"./resources/tests/images/planet_gnome/expected.html")
.expect("Failed to create output file");
assert_eq!(expected, result);
}
}