use std::collections::{HashMap, HashSet};
use std::io::Result;
use std::sync::mpsc;
use std::thread;
use pulldown_cmark::{Event, Tag};
use tracing::{event, Level};
use url::Url;
use super::curl::CurlResourceHandler;
use super::{MimeData, ResourceUrlHandler};
use crate::references::UrlBase;
use crate::Environment;
/// Intended cap on how many remote fetches `prefetch_remote` runs at the same time.
const MAX_PARALLEL_FETCHES: usize = 8;
/// Collect the distinct remote image URLs referenced by `events`.
///
/// Only `Event::Start(Tag::Image { .. })` events are considered.  Each image
/// destination is resolved through `env.resolve_reference`; destinations that do
/// not resolve, or that resolve to a scheme other than `http` or `https`, are
/// skipped.  The returned URLs keep the order of their first appearance and
/// contain no duplicates.
#[must_use]
pub fn scan_remote_image_urls(events: &[Event<'_>], env: &Environment) -> Vec<Url> {
    // Remembers every remote URL already emitted so repeats are dropped.
    let mut already_listed = HashSet::new();
    events
        .iter()
        .filter_map(|candidate| match candidate {
            Event::Start(Tag::Image { dest_url, .. }) => env.resolve_reference(dest_url),
            _ => None,
        })
        .filter(|target| matches!(target.scheme(), "http" | "https"))
        // `insert` returns `false` for a URL that was seen before.
        .filter(|target| already_listed.insert(target.clone()))
        .collect()
}
/// Fetch every URL in `urls` ahead of time and return the successful responses
/// keyed by URL.
///
/// The work is spread over at most [`MAX_PARALLEL_FETCHES`] worker threads (fewer
/// when there are fewer URLs).  The workers drain one shared queue, so every URL
/// is attempted regardless of how many there are.  Each fetch builds its own
/// [`CurlResourceHandler`] from `read_limit` and `user_agent` and reads the URL
/// through it.
///
/// Prefetching is best effort: a fetch that fails is logged at debug level and
/// simply left out of the returned map, and a panicking worker is ignored.  An
/// empty `urls` yields an empty map without spawning any thread.
#[must_use]
pub fn prefetch_remote(
    urls: Vec<Url>,
    user_agent: &'static str,
    read_limit: u64,
) -> HashMap<Url, MimeData> {
    if urls.is_empty() {
        return HashMap::new();
    }
    // `.max(1)` keeps at least one worker even if the constant were ever set to zero.
    let worker_count = urls.len().min(MAX_PARALLEL_FETCHES.max(1));
    let mut cache = HashMap::with_capacity(urls.len());
    // Shared work queue: each worker takes the next URL until none are left.
    let pending = std::sync::Mutex::new(urls.into_iter());
    let (tx, rx) = mpsc::channel::<(Url, Result<MimeData>)>();
    thread::scope(|scope| {
        let mut workers = Vec::with_capacity(worker_count);
        for _ in 0..worker_count {
            let tx = tx.clone();
            let pending = &pending;
            workers.push(scope.spawn(move || loop {
                // Take the next URL in its own statement so the lock guard is
                // released before the (slow) network fetch starts.
                let next = pending
                    .lock()
                    .unwrap_or_else(std::sync::PoisonError::into_inner)
                    .next();
                let Some(url) = next else {
                    break;
                };
                let result = CurlResourceHandler::create(read_limit, user_agent)
                    .and_then(|h| h.read_resource(&url));
                if tx.send((url, result)).is_err() {
                    // The receiver is gone; nobody is interested in further results.
                    break;
                }
            }));
        }
        // Drop the original sender so the receive loop ends once all workers finish.
        drop(tx);
        for (url, result) in rx {
            match result {
                Ok(data) => {
                    cache.insert(url, data);
                }
                Err(err) => {
                    event!(Level::DEBUG, %url, %err, "prefetch failed, falling through");
                }
            }
        }
        // Join explicitly and discard the outcome: a worker panic must not
        // propagate out of `thread::scope`, since prefetching is optional.
        for worker in workers {
            let _ = worker.join();
        }
    });
    cache
}
/// Resource handler that answers from an in-memory cache before delegating to
/// a wrapped handler.
pub struct CachingResourceHandler<H: ResourceUrlHandler> {
/// Already-fetched resources, keyed by their URL.
cache: HashMap<Url, MimeData>,
/// Handler consulted for every URL that is not in `cache`.
inner: H,
}
impl<H: ResourceUrlHandler> CachingResourceHandler<H> {
pub fn new(cache: HashMap<Url, MimeData>, inner: H) -> Self {
Self { cache, inner }
}
pub fn passthrough(inner: H) -> Self {
Self::new(HashMap::new(), inner)
}
}
impl<H: ResourceUrlHandler> ResourceUrlHandler for CachingResourceHandler<H> {
    /// Return a copy of the cached data for `url` if there is one; otherwise
    /// forward the request to the wrapped handler and return its result.
    fn read_resource(&self, url: &Url) -> Result<MimeData> {
        match self.cache.get(url) {
            Some(hit) => Ok(hit.clone()),
            None => self.inner.read_resource(url),
        }
    }
}
pub fn prefetch_and_wrap<H: ResourceUrlHandler>(
events: &[Event<'_>],
env: &Environment,
user_agent: &'static str,
read_limit: u64,
inner: H,
) -> CachingResourceHandler<H> {
let urls = scan_remote_image_urls(events, env);
if urls.is_empty() {
return CachingResourceHandler::passthrough(inner);
}
event!(
Level::DEBUG,
count = urls.len(),
"prefetching remote image URLs in parallel"
);
let cache = prefetch_remote(urls, user_agent, read_limit);
CachingResourceHandler::new(cache, inner)
}
#[cfg(test)]
mod tests {
    use std::io::ErrorKind;

    use pulldown_cmark::{CowStr, LinkType};

    use super::*;

    /// Environment rooted at the current working directory.
    fn env() -> Environment {
        let cwd = std::env::current_dir().unwrap();
        Environment::for_local_directory(&cwd).unwrap()
    }

    /// Start event of an inline image with destination `dest` and empty title/id.
    fn image_start(dest: &'static str) -> Event<'static> {
        Event::Start(Tag::Image {
            link_type: LinkType::Inline,
            dest_url: CowStr::Borrowed(dest),
            title: CowStr::Borrowed(""),
            id: CowStr::Borrowed(""),
        })
    }

    #[test]
    fn scan_deduplicates_and_filters() {
        // One remote URL twice, one relative path, one non-HTTP scheme.
        let events: Vec<Event<'static>> = [
            "https://example.com/a.png",
            "https://example.com/a.png",
            "./local.png",
            "ftp://example.com/x.png",
        ]
        .into_iter()
        .map(image_start)
        .collect();

        let found = scan_remote_image_urls(&events, &env());
        let found: Vec<&str> = found.iter().map(Url::as_str).collect();
        assert_eq!(found, ["https://example.com/a.png"]);
    }

    #[test]
    fn empty_prefetch_returns_empty_cache() {
        assert!(prefetch_remote(Vec::new(), "test/0.0", 1024).is_empty());
    }

    #[test]
    fn caching_handler_serves_cached_then_delegates() {
        let cached_url: Url = "https://example.com/a.png".parse().unwrap();
        let uncached_url: Url = "https://example.com/b.png".parse().unwrap();

        let entry = MimeData {
            mime_type: None,
            data: b"cached".to_vec(),
        };
        let cache = HashMap::from([(cached_url.clone(), entry)]);
        let handler = CachingResourceHandler::new(cache, super::super::NoopResourceHandler);

        // A cached URL is answered from the map.
        let served = handler.read_resource(&cached_url).unwrap();
        assert_eq!(served.data, b"cached");

        // Anything else reaches the wrapped no-op handler, which refuses it.
        let delegated = handler.read_resource(&uncached_url).map_err(|e| e.kind());
        assert!(matches!(delegated, Err(ErrorKind::Unsupported)));
    }
}