Skip to main content

mdcat/resources/
prefetch.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4
5//! Parallel prefetch for remote image URLs.
6//!
7//! Scan the event stream for unique remote URLs, fan fetches out to
8//! worker threads (capped at [`MAX_PARALLEL_FETCHES`]), cache the
9//! results in a [`CachingResourceHandler`].
10
11use std::collections::{HashMap, HashSet};
12use std::io::Result;
13use std::sync::mpsc;
14use std::thread;
15
16use pulldown_cmark::{Event, Tag};
17use tracing::{event, Level};
18use url::Url;
19
20use super::curl::CurlResourceHandler;
21use super::{MimeData, ResourceUrlHandler};
22use crate::references::UrlBase;
23use crate::Environment;
24
25/// Upper bound on threads fanned out for a single render. Prevents a
26/// pathological markdown document with hundreds of image URLs from
27/// spawning hundreds of threads against one remote host.
28const MAX_PARALLEL_FETCHES: usize = 8;
29
30/// Collect unique `http` / `https` image URLs referenced by `events`.
31///
32/// Relative URLs are resolved against `env.base_url`. Duplicates are
33/// deduplicated so a README that references the same badge twice
34/// still only fetches it once.
35#[must_use]
36pub fn scan_remote_image_urls(events: &[Event<'_>], env: &Environment) -> Vec<Url> {
37    let mut seen = HashSet::new();
38    let mut urls = Vec::new();
39    for event in events {
40        let Event::Start(Tag::Image { dest_url, .. }) = event else {
41            continue;
42        };
43        let Some(resolved) = env.resolve_reference(dest_url) else {
44            continue;
45        };
46        if !matches!(resolved.scheme(), "http" | "https") {
47            continue;
48        }
49        if seen.insert(resolved.clone()) {
50            urls.push(resolved);
51        }
52    }
53    urls
54}
55
56/// Fetch every URL in `urls` in parallel and return the results keyed
57/// by URL. Individual fetch failures are logged and dropped rather
58/// than propagated — the render can still fall back to rendering the
59/// image as a link, which is what we'd do on a network error anyway.
60#[must_use]
61pub fn prefetch_remote(
62    urls: Vec<Url>,
63    user_agent: &'static str,
64    read_limit: u64,
65) -> HashMap<Url, MimeData> {
66    if urls.is_empty() {
67        return HashMap::new();
68    }
69    let (tx, rx) = mpsc::channel::<(Url, Result<MimeData>)>();
70    let mut handles = Vec::with_capacity(urls.len().min(MAX_PARALLEL_FETCHES));
71    for chunk in urls.chunks(MAX_PARALLEL_FETCHES.max(1)) {
72        for url in chunk {
73            let tx = tx.clone();
74            let url = url.clone();
75            handles.push(thread::spawn(move || {
76                let result = CurlResourceHandler::create(read_limit, user_agent)
77                    .and_then(|h| h.read_resource(&url));
78                let _ = tx.send((url, result));
79            }));
80            if handles.len() >= MAX_PARALLEL_FETCHES {
81                break;
82            }
83        }
84    }
85    drop(tx);
86    let mut cache = HashMap::new();
87    for (url, result) in rx {
88        match result {
89            Ok(data) => {
90                cache.insert(url, data);
91            }
92            Err(err) => {
93                event!(Level::DEBUG, %url, %err, "prefetch failed, falling through");
94            }
95        }
96    }
97    for handle in handles {
98        let _ = handle.join();
99    }
100    cache
101}
102
103/// Resource handler that serves prefetched URL bytes first, then
104/// delegates everything else to an inner handler.
105pub struct CachingResourceHandler<H: ResourceUrlHandler> {
106    cache: HashMap<Url, MimeData>,
107    inner: H,
108}
109
110impl<H: ResourceUrlHandler> CachingResourceHandler<H> {
111    /// Wrap `inner` with the given prefetched cache.
112    pub fn new(cache: HashMap<Url, MimeData>, inner: H) -> Self {
113        Self { cache, inner }
114    }
115
116    /// Wrap `inner` with an empty cache (every read falls through).
117    pub fn passthrough(inner: H) -> Self {
118        Self::new(HashMap::new(), inner)
119    }
120}
121
122impl<H: ResourceUrlHandler> ResourceUrlHandler for CachingResourceHandler<H> {
123    fn read_resource(&self, url: &Url) -> Result<MimeData> {
124        if let Some(data) = self.cache.get(url) {
125            return Ok(data.clone());
126        }
127        self.inner.read_resource(url)
128    }
129}
130
131/// Convenience: scan + prefetch in one call. Returns a wrapping
132/// handler that the render pipeline can use transparently.
133pub fn prefetch_and_wrap<H: ResourceUrlHandler>(
134    events: &[Event<'_>],
135    env: &Environment,
136    user_agent: &'static str,
137    read_limit: u64,
138    inner: H,
139) -> CachingResourceHandler<H> {
140    let urls = scan_remote_image_urls(events, env);
141    if urls.is_empty() {
142        return CachingResourceHandler::passthrough(inner);
143    }
144    event!(
145        Level::DEBUG,
146        count = urls.len(),
147        "prefetching remote image URLs in parallel"
148    );
149    let cache = prefetch_remote(urls, user_agent, read_limit);
150    CachingResourceHandler::new(cache, inner)
151}
152
#[cfg(test)]
mod tests {
    use std::io::ErrorKind;

    use pulldown_cmark::{CowStr, LinkType};

    use super::*;

    /// Environment rooted at the current working directory.
    fn env() -> Environment {
        let cwd = std::env::current_dir().unwrap();
        Environment::for_local_directory(&cwd).unwrap()
    }

    /// Start-of-image event for an inline image pointing at `dest`,
    /// with empty title and id.
    fn inline_image(dest: &'static str) -> Event<'static> {
        Event::Start(Tag::Image {
            link_type: LinkType::Inline,
            dest_url: CowStr::Borrowed(dest),
            title: CowStr::Borrowed(""),
            id: CowStr::Borrowed(""),
        })
    }

    #[test]
    fn scan_deduplicates_and_filters() {
        // Same https URL twice, one relative path, one non-http scheme.
        let events = [
            inline_image("https://example.com/a.png"),
            inline_image("https://example.com/a.png"),
            inline_image("./local.png"),
            inline_image("ftp://example.com/x.png"),
        ];
        let urls = scan_remote_image_urls(&events, &env());
        let found: Vec<&str> = urls.iter().map(Url::as_str).collect();
        assert_eq!(found, ["https://example.com/a.png"]);
    }

    #[test]
    fn empty_prefetch_returns_empty_cache() {
        let cache = prefetch_remote(Vec::new(), "test/0.0", 1024);
        assert!(cache.is_empty());
    }

    #[test]
    fn caching_handler_serves_cached_then_delegates() {
        let url: Url = "https://example.com/a.png".parse().unwrap();
        let entry = MimeData {
            mime_type: None,
            data: b"cached".to_vec(),
        };
        let cache = HashMap::from([(url.clone(), entry)]);
        let handler = CachingResourceHandler::new(cache, super::super::NoopResourceHandler);

        // Hit: answered from the prefetched map.
        assert_eq!(handler.read_resource(&url).unwrap().data, b"cached");

        // Miss: forwarded to the inner handler, whose error surfaces here.
        let missing: Url = "https://example.com/b.png".parse().unwrap();
        let outcome = handler.read_resource(&missing).map_err(|e| e.kind());
        assert!(matches!(outcome, Err(ErrorKind::Unsupported)));
    }
}