mdcat/resources/
prefetch.rs1use std::collections::{HashMap, HashSet};
12use std::io::Result;
13use std::sync::mpsc;
14use std::thread;
15
16use pulldown_cmark::{Event, Tag};
17use tracing::{event, Level};
18use url::Url;
19
20use super::curl::CurlResourceHandler;
21use super::{MimeData, ResourceUrlHandler};
22use crate::references::UrlBase;
23use crate::Environment;
24
/// Batch size used by [`prefetch_remote`]: URLs are split into chunks of this
/// many, and no chunk spawns more fetch threads than this.
const MAX_PARALLEL_FETCHES: usize = 8;
29
30#[must_use]
36pub fn scan_remote_image_urls(events: &[Event<'_>], env: &Environment) -> Vec<Url> {
37 let mut seen = HashSet::new();
38 let mut urls = Vec::new();
39 for event in events {
40 let Event::Start(Tag::Image { dest_url, .. }) = event else {
41 continue;
42 };
43 let Some(resolved) = env.resolve_reference(dest_url) else {
44 continue;
45 };
46 if !matches!(resolved.scheme(), "http" | "https") {
47 continue;
48 }
49 if seen.insert(resolved.clone()) {
50 urls.push(resolved);
51 }
52 }
53 urls
54}
55
56#[must_use]
61pub fn prefetch_remote(
62 urls: Vec<Url>,
63 user_agent: &'static str,
64 read_limit: u64,
65) -> HashMap<Url, MimeData> {
66 if urls.is_empty() {
67 return HashMap::new();
68 }
69 let (tx, rx) = mpsc::channel::<(Url, Result<MimeData>)>();
70 let mut handles = Vec::with_capacity(urls.len().min(MAX_PARALLEL_FETCHES));
71 for chunk in urls.chunks(MAX_PARALLEL_FETCHES.max(1)) {
72 for url in chunk {
73 let tx = tx.clone();
74 let url = url.clone();
75 handles.push(thread::spawn(move || {
76 let result = CurlResourceHandler::create(read_limit, user_agent)
77 .and_then(|h| h.read_resource(&url));
78 let _ = tx.send((url, result));
79 }));
80 if handles.len() >= MAX_PARALLEL_FETCHES {
81 break;
82 }
83 }
84 }
85 drop(tx);
86 let mut cache = HashMap::new();
87 for (url, result) in rx {
88 match result {
89 Ok(data) => {
90 cache.insert(url, data);
91 }
92 Err(err) => {
93 event!(Level::DEBUG, %url, %err, "prefetch failed, falling through");
94 }
95 }
96 }
97 for handle in handles {
98 let _ = handle.join();
99 }
100 cache
101}
102
103pub struct CachingResourceHandler<H: ResourceUrlHandler> {
106 cache: HashMap<Url, MimeData>,
107 inner: H,
108}
109
110impl<H: ResourceUrlHandler> CachingResourceHandler<H> {
111 pub fn new(cache: HashMap<Url, MimeData>, inner: H) -> Self {
113 Self { cache, inner }
114 }
115
116 pub fn passthrough(inner: H) -> Self {
118 Self::new(HashMap::new(), inner)
119 }
120}
121
122impl<H: ResourceUrlHandler> ResourceUrlHandler for CachingResourceHandler<H> {
123 fn read_resource(&self, url: &Url) -> Result<MimeData> {
124 if let Some(data) = self.cache.get(url) {
125 return Ok(data.clone());
126 }
127 self.inner.read_resource(url)
128 }
129}
130
131pub fn prefetch_and_wrap<H: ResourceUrlHandler>(
134 events: &[Event<'_>],
135 env: &Environment,
136 user_agent: &'static str,
137 read_limit: u64,
138 inner: H,
139) -> CachingResourceHandler<H> {
140 let urls = scan_remote_image_urls(events, env);
141 if urls.is_empty() {
142 return CachingResourceHandler::passthrough(inner);
143 }
144 event!(
145 Level::DEBUG,
146 count = urls.len(),
147 "prefetching remote image URLs in parallel"
148 );
149 let cache = prefetch_remote(urls, user_agent, read_limit);
150 CachingResourceHandler::new(cache, inner)
151}
152
#[cfg(test)]
mod tests {
    use std::io::ErrorKind;

    use pulldown_cmark::{CowStr, LinkType};

    use super::*;

    /// Build an [`Environment`] for the current working directory.
    fn env() -> Environment {
        let cwd = std::env::current_dir().unwrap();
        Environment::for_local_directory(&cwd).unwrap()
    }

    /// Create the start event of an inline image pointing at `dest`, with an
    /// empty title and id.
    fn image_start(dest: &'static str) -> Event<'static> {
        Event::Start(Tag::Image {
            link_type: LinkType::Inline,
            dest_url: CowStr::Borrowed(dest),
            title: CowStr::Borrowed(""),
            id: CowStr::Borrowed(""),
        })
    }

    #[test]
    fn scan_deduplicates_and_filters() {
        // A duplicated https image, a relative path, and a non-HTTP scheme:
        // only the single https URL should be reported.
        let events = [
            image_start("https://example.com/a.png"),
            image_start("https://example.com/a.png"),
            image_start("./local.png"),
            image_start("ftp://example.com/x.png"),
        ];
        let urls = scan_remote_image_urls(&events, &env());
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].as_str(), "https://example.com/a.png");
    }

    #[test]
    fn empty_prefetch_returns_empty_cache() {
        let cache = prefetch_remote(Vec::new(), "test/0.0", 1024);
        assert!(cache.is_empty());
    }

    #[test]
    fn caching_handler_serves_cached_then_delegates() {
        let url: Url = "https://example.com/a.png".parse().unwrap();
        let entry = MimeData {
            mime_type: None,
            data: b"cached".to_vec(),
        };
        let cache = HashMap::from([(url.clone(), entry)]);
        let handler = CachingResourceHandler::new(cache, super::super::NoopResourceHandler);

        // Cached URL is answered from the map.
        assert_eq!(handler.read_resource(&url).unwrap().data, b"cached");

        // Anything else reaches the wrapped handler, whose error surfaces.
        let missing: Url = "https://example.com/b.png".parse().unwrap();
        assert!(matches!(
            handler.read_resource(&missing),
            Err(err) if err.kind() == ErrorKind::Unsupported
        ));
    }
}