cached_path/
cache.rs

1use fs2::FileExt;
2use glob::glob;
3use log::{debug, error, info, warn};
4use rand::distributions::{Distribution, Uniform};
5use reqwest::blocking::{Client, ClientBuilder};
6use reqwest::header::ETAG;
7use std::default::Default;
8use std::env;
9use std::fs::{self, OpenOptions};
10use std::path::{Path, PathBuf};
11use std::thread;
12use std::time::{self, Duration};
13use tempfile::NamedTempFile;
14
15use crate::archives::{extract_archive, ArchiveFormat};
16use crate::utils::hash_str;
17use crate::{meta::Meta, Error, ProgressBar};
18
19/// Builder to facilitate creating [`Cache`] objects.
20#[derive(Debug)]
21pub struct CacheBuilder {
22    config: Config,
23}
24
25#[derive(Debug)]
26struct Config {
27    dir: Option<PathBuf>,
28    client_builder: ClientBuilder,
29    max_retries: u32,
30    max_backoff: u32,
31    freshness_lifetime: Option<u64>,
32    offline: bool,
33    progress_bar: Option<ProgressBar>,
34}
35
36impl CacheBuilder {
37    /// Construct a new `CacheBuilder`.
38    pub fn new() -> CacheBuilder {
39        CacheBuilder {
40            config: Config {
41                dir: None,
42                client_builder: ClientBuilder::new().timeout(None),
43                max_retries: 3,
44                max_backoff: 5000,
45                freshness_lifetime: None,
46                offline: false,
47                progress_bar: Some(ProgressBar::default()),
48            },
49        }
50    }
51
52    /// Construct a new `CacheBuilder` with a `ClientBuilder`.
53    pub fn with_client_builder(client_builder: ClientBuilder) -> CacheBuilder {
54        CacheBuilder::new().client_builder(client_builder)
55    }
56
57    /// Set the cache location. This can be set through the environment
58    /// variable `RUST_CACHED_PATH_ROOT`. Otherwise it will default to a subdirectory
59    /// named 'cache' of the default system temp directory.
60    pub fn dir(mut self, dir: PathBuf) -> CacheBuilder {
61        self.config.dir = Some(dir);
62        self
63    }
64
65    /// Set the `ClientBuilder`.
66    pub fn client_builder(mut self, client_builder: ClientBuilder) -> CacheBuilder {
67        self.config.client_builder = client_builder;
68        self
69    }
70
71    /// Enable a request timeout.
72    pub fn timeout(mut self, timeout: Duration) -> CacheBuilder {
73        self.config.client_builder = self.config.client_builder.timeout(timeout);
74        self
75    }
76
77    /// Enable a timeout for the connect phase of each HTTP request.
78    pub fn connect_timeout(mut self, timeout: Duration) -> CacheBuilder {
79        self.config.client_builder = self.config.client_builder.connect_timeout(timeout);
80        self
81    }
82
83    /// Set maximum number of retries for HTTP requests.
84    pub fn max_retries(mut self, max_retries: u32) -> CacheBuilder {
85        self.config.max_retries = max_retries;
86        self
87    }
88
89    /// Set the maximum backoff delay in milliseconds for retrying HTTP requests.
90    pub fn max_backoff(mut self, max_backoff: u32) -> CacheBuilder {
91        self.config.max_backoff = max_backoff;
92        self
93    }
94
95    /// Set the default freshness lifetime, in seconds. The default is None, meaning
96    /// the ETAG for an external resource will always be checked for a fresher value.
97    pub fn freshness_lifetime(mut self, freshness_lifetime: u64) -> CacheBuilder {
98        self.config.freshness_lifetime = Some(freshness_lifetime);
99        self
100    }
101
102    /// Only use offline functionality.
103    ///
104    /// If set to `true`, when the cached path of an HTTP resource is requested,
105    /// the latest cached version is returned without checking for freshness.
106    /// But if no cached versions exist, a
107    /// [`NoCachedVersions`](enum.Error.html#variant.NoCachedVersions) error is returned.
108    pub fn offline(mut self, offline: bool) -> CacheBuilder {
109        self.config.offline = offline;
110        self
111    }
112
113    /// Set the type of progress bar to use.
114    ///
115    /// The default is `Some(ProgressBar::Full)`.
116    pub fn progress_bar(mut self, progress_bar: Option<ProgressBar>) -> CacheBuilder {
117        self.config.progress_bar = progress_bar;
118        self
119    }
120
121    /// Build the `Cache` object.
122    pub fn build(self) -> Result<Cache, Error> {
123        let dir = self.config.dir.unwrap_or_else(|| {
124            if let Some(dir_str) = env::var_os("RUST_CACHED_PATH_ROOT") {
125                PathBuf::from(dir_str)
126            } else {
127                env::temp_dir().join("cache/")
128            }
129        });
130        let http_client = self.config.client_builder.build()?;
131        fs::create_dir_all(&dir)?;
132        Ok(Cache {
133            dir,
134            http_client,
135            max_retries: self.config.max_retries,
136            max_backoff: self.config.max_backoff,
137            freshness_lifetime: self.config.freshness_lifetime,
138            offline: self.config.offline,
139            progress_bar: self.config.progress_bar,
140        })
141    }
142}
143
144impl Default for CacheBuilder {
145    fn default() -> Self {
146        Self::new()
147    }
148}
149
/// Options to use with [`Cache::cached_path_with_options`].
///
/// The default value uses no subdirectory and does not extract the resource.
#[derive(Debug, Clone, Default)]
pub struct Options {
    /// An optional subdirectory (relative to the cache root) to cache the resource in.
    pub subdir: Option<String>,
    /// Automatically extract the resource, assuming the resource is an archive.
    pub extract: bool,
}
158
159impl Options {
160    pub fn new(subdir: Option<&str>, extract: bool) -> Self {
161        Self {
162            subdir: subdir.map(String::from),
163            extract,
164        }
165    }
166
167    /// The the cache subdirectory to use.
168    pub fn subdir(mut self, subdir: &str) -> Self {
169        self.subdir = Some(subdir.into());
170        self
171    }
172
173    /// Treat the resource as an archive and try to extract it.
174    pub fn extract(mut self) -> Self {
175        self.extract = true;
176        self
177    }
178}
179
180/// Fetches and manages resources in a local cache directory.
181#[derive(Debug, Clone)]
182pub struct Cache {
183    /// The root directory of the cache.
184    pub dir: PathBuf,
185    /// The maximum number of times to retry downloading a remote resource.
186    max_retries: u32,
187    /// The maximum amount of time (in milliseconds) to wait before retrying a download.
188    max_backoff: u32,
189    /// An optional freshness lifetime (in seconds).
190    ///
191    /// If set, resources that were cached within the past `freshness_lifetime` seconds
192    /// will always be regarded as fresh, and so the ETag of the corresponding remote
193    /// resource won't be checked.
194    freshness_lifetime: Option<u64>,
195    /// Offline mode.
196    ///
197    /// If set to `true`, no HTTP calls will be made.
198    offline: bool,
199    /// The verbosity level of the progress bar.
200    progress_bar: Option<ProgressBar>,
201    /// The HTTP client used to fetch remote resources.
202    http_client: Client,
203}
204
205impl Cache {
206    /// Create a new `Cache` instance.
207    pub fn new() -> Result<Self, Error> {
208        Cache::builder().build()
209    }
210
211    /// Create a `CacheBuilder`.
212    pub fn builder() -> CacheBuilder {
213        CacheBuilder::new()
214    }
215
216    /// Get the cached path to a resource.
217    ///
218    /// If the resource is local file, it's path is returned. If the resource is a static HTTP
219    /// resource, it will cached locally and the path to the cache file will be returned.
220    pub fn cached_path(&self, resource: &str) -> Result<PathBuf, Error> {
221        self.cached_path_with_options(resource, &Options::default())
222    }
223
224    /// Get the cached path to a resource using the given options.
225    ///
226    /// # Examples
227    ///
228    /// Use a particular subdirectory of the cache root:
229    ///
230    /// ```rust,no_run
231    /// # use cached_path::{Cache, Options};
232    /// # let cache = Cache::new().unwrap();
233    /// # let subdir = "target";
234    /// # let resource = "README.md";
235    /// let path = cache.cached_path_with_options(
236    ///     resource,
237    ///     &Options::default().subdir(subdir),
238    /// ).unwrap();
239    /// ```
240    ///
241    /// Treat the resource as an archive and extract it. The path returned is the
242    /// path to the extraction directory:
243    ///
244    /// ```rust,no_run
245    /// # use cached_path::{Cache, Options};
246    /// # let cache = Cache::new().unwrap();
247    /// # let subdir = "target";
248    /// # let resource = "README.md";
249    /// let path = cache.cached_path_with_options(
250    ///     resource,
251    ///     &Options::default().extract(),
252    /// ).unwrap();
253    /// assert!(path.is_dir());
254    /// ```
255    pub fn cached_path_with_options(
256        &self,
257        resource: &str,
258        options: &Options,
259    ) -> Result<PathBuf, Error> {
260        let cached_path: PathBuf;
261        let mut extraction_dir: Option<PathBuf> = None;
262
263        if !resource.starts_with("http") {
264            // If resource doesn't look like a URL, treat as local path, but return
265            // an error if the path doesn't exist.
266            info!("Treating {} as local file", resource);
267            cached_path = PathBuf::from(resource);
268
269            if !cached_path.is_file() {
270                return Err(Error::ResourceNotFound(String::from(resource)));
271            }
272
273            if options.extract {
274                // If we need to extract, we extract into a unique subdirectory of the cache directory
275                // so as not to mess with the file system outside of the cache directory.
276                // To make sure that we use a unique directory for each "version" of this local
277                // resource, we treat the last modified time as an ETag.
278                let resource_last_modified = fs::metadata(resource)?
279                    .modified()
280                    .ok()
281                    .and_then(|sys_time| sys_time.elapsed().ok())
282                    .map(|duration| format!("{}", duration.as_secs()));
283                extraction_dir = Some(self.resource_to_filepath(
284                    resource,
285                    &resource_last_modified,
286                    options.subdir.as_deref(),
287                    Some("-extracted"),
288                ));
289            }
290        } else {
291            // This is a remote resource, so fetch it to the cache.
292            let meta = self.fetch_remote_resource(resource, options.subdir.as_deref())?;
293
294            // Check if we need to extract.
295            if options.extract {
296                extraction_dir = Some(meta.get_extraction_path());
297            }
298
299            cached_path = meta.resource_path;
300        }
301
302        if let Some(dirpath) = extraction_dir {
303            // Extract archive.
304            debug!("Treating {} as archive", resource);
305
306            fs::create_dir_all(dirpath.parent().unwrap())?;
307
308            // Need to acquire a lock here to make sure we don't try to extract
309            // the same archive in parallel from multiple processes.
310            debug!("Acquiring lock on extraction directory for {}", resource);
311            let lock_path = format!("{}.lock", dirpath.to_str().unwrap());
312            let filelock = OpenOptions::new()
313                .read(true)
314                .write(true)
315                .create(true)
316                .truncate(true)
317                .open(lock_path)?;
318            filelock.lock_exclusive()?;
319            debug!("Lock on extraction directory acquired for {}", resource);
320
321            if !dirpath.is_dir() {
322                info!("Extracting {} to {:?}", resource, dirpath);
323                let format = ArchiveFormat::parse_from_extension(resource)?;
324                extract_archive(&cached_path, &dirpath, &format)?;
325            }
326
327            fs2::FileExt::unlock(&filelock)?;
328            //filelock.unlock()?;
329            debug!("Lock released on extraction directory for {}", resource);
330
331            Ok(dirpath)
332        } else {
333            Ok(cached_path)
334        }
335    }
336
337    /// A convenience method to get the cached path to a resource using the given
338    /// cache subdirectory (relative to the cache root).
339    ///
340    /// This is equivalent to:
341    ///
342    /// ```rust,no_run
343    /// # use cached_path::{Cache, Options};
344    /// # let cache = Cache::new().unwrap();
345    /// # let subdir = "target";
346    /// # let resource = "README.md";
347    /// let path = cache.cached_path_with_options(
348    ///     resource,
349    ///     &Options::default().subdir(subdir),
350    /// ).unwrap();
351    /// ```
352    #[deprecated(
353        since = "0.4.4",
354        note = "Please use Cache::cached_path_with_options() instead"
355    )]
356    pub fn cached_path_in_subdir(
357        &self,
358        resource: &str,
359        subdir: Option<&str>,
360    ) -> Result<PathBuf, Error> {
361        let options = Options::new(subdir, false);
362        self.cached_path_with_options(resource, &options)
363    }
364
365    fn fetch_remote_resource(&self, resource: &str, subdir: Option<&str>) -> Result<Meta, Error> {
366        // Otherwise we attempt to parse the URL.
367        let url =
368            reqwest::Url::parse(resource).map_err(|_| Error::InvalidUrl(String::from(resource)))?;
369
370        // Ensure root directory exists in case it has changed or been removed.
371        if let Some(subdir_path) = subdir {
372            fs::create_dir_all(self.dir.join(subdir_path))?;
373        } else {
374            fs::create_dir_all(&self.dir)?;
375        };
376
377        // Find any existing cached versions of resource and check if they are still
378        // fresh according to the `freshness_lifetime` setting.
379        let versions = self.find_existing(resource, subdir); // already sorted, latest is first.
380        if self.offline {
381            if !versions.is_empty() {
382                info!("Found existing cached version of {}", resource);
383                return Ok(versions[0].clone());
384            } else {
385                error!("Offline mode is enabled but no cached versions of resource exist.");
386                return Err(Error::NoCachedVersions(String::from(resource)));
387            }
388        } else if !versions.is_empty() && versions[0].is_fresh(self.freshness_lifetime) {
389            // Oh hey, the latest version is still fresh!
390            info!("Latest cached version of {} is still fresh", resource);
391            return Ok(versions[0].clone());
392        }
393
394        // No existing version or the existing versions are older than their freshness
395        // lifetimes, so we'll query for the ETAG of the resource and then compare
396        // that with any existing versions.
397        let etag = self.try_get_etag(resource, &url)?;
398        let path = self.resource_to_filepath(resource, &etag, subdir, None);
399
400        // Before going further we need to obtain a lock on the file to provide
401        // parallel downloads of the same resource.
402        debug!("Acquiring lock for cache of {}", resource);
403        let lock_path = format!("{}.lock", path.to_str().unwrap());
404        let filelock = OpenOptions::new()
405            .read(true)
406            .write(true)
407            .create(true)
408            .truncate(true)
409            .open(lock_path)?;
410        filelock.lock_exclusive()?;
411        debug!("Lock acquired for {}", resource);
412
413        if path.exists() {
414            // Oh cool! The cache is up-to-date according to the ETAG.
415            // We'll return the up-to-date version and clean up any other
416            // dangling ones.
417            info!("Cached version of {} is up-to-date", resource);
418            //filelock.unlock()?;
419            fs2::FileExt::unlock(&filelock)?;
420            return Meta::from_cache(&path);
421        }
422
423        // No up-to-date version cached, so we have to try downloading it.
424        let meta = self.try_download_resource(resource, &url, &path, &etag)?;
425
426        info!("New version of {} cached", resource);
427
428        //filelock.unlock()?;
429        fs2::FileExt::unlock(&filelock)?;
430        debug!("Lock released for {}", resource);
431
432        Ok(meta)
433    }
434
435    /// Find existing versions of a cached resource, sorted by most recent first.
436    fn find_existing(&self, resource: &str, subdir: Option<&str>) -> Vec<Meta> {
437        let mut existing_meta: Vec<Meta> = vec![];
438        let glob_string = format!(
439            "{}*.meta",
440            self.resource_to_filepath(resource, &None, subdir, None)
441                .to_str()
442                .unwrap(),
443        );
444        for meta_path in glob(&glob_string).unwrap().filter_map(Result::ok) {
445            if let Ok(meta) = Meta::from_path(&meta_path) {
446                existing_meta.push(meta);
447            }
448        }
449        existing_meta
450            .sort_unstable_by(|a, b| b.creation_time.partial_cmp(&a.creation_time).unwrap());
451        existing_meta
452    }
453
454    fn get_retry_delay(&self, retries: u32) -> u32 {
455        let between = Uniform::from(0..1000);
456        let mut rng = rand::thread_rng();
457        std::cmp::min(
458            2u32.pow(retries - 1) * 1000 + between.sample(&mut rng),
459            self.max_backoff,
460        )
461    }
462
463    fn try_download_resource(
464        &self,
465        resource: &str,
466        url: &reqwest::Url,
467        path: &Path,
468        etag: &Option<String>,
469    ) -> Result<Meta, Error> {
470        let mut retries: u32 = 0;
471        loop {
472            match self.download_resource(resource, url, path, etag) {
473                Ok(meta) => {
474                    return Ok(meta);
475                }
476                Err(err) => {
477                    if retries >= self.max_retries {
478                        error!("Max retries exceeded for {}", resource);
479                        return Err(err);
480                    }
481                    if !err.is_retriable() {
482                        error!("Download failed for {} with fatal error, {}", resource, err);
483                        return Err(err);
484                    }
485                    retries += 1;
486                    let retry_delay = self.get_retry_delay(retries);
487                    warn!(
488                        "Download failed for {}: {}\nRetrying in {} milliseconds...",
489                        resource, err, retry_delay
490                    );
491                    thread::sleep(time::Duration::from_millis(u64::from(retry_delay)));
492                }
493            }
494        }
495    }
496
497    fn download_resource(
498        &self,
499        resource: &str,
500        url: &reqwest::Url,
501        path: &Path,
502        etag: &Option<String>,
503    ) -> Result<Meta, Error> {
504        debug!("Attempting connection to {}", url);
505
506        let mut response = self
507            .http_client
508            .get(url.clone())
509            .send()?
510            .error_for_status()?;
511
512        debug!("Opened connection to {}", url);
513
514        // First we make a temporary file and download the contents of the resource into it.
515        // Otherwise if we wrote directly to the cache file and the download got
516        // interrupted we could be left with a corrupted cache file.
517        let tempfile = NamedTempFile::new_in(path.parent().unwrap())?;
518        let mut tempfile_write_handle = OpenOptions::new().write(true).open(tempfile.path())?;
519
520        info!("Starting download of {}", url);
521
522        let bytes = if let Some(progress_bar) = &self.progress_bar {
523            let mut download_wrapper = progress_bar.wrap_download(
524                resource,
525                response.content_length(),
526                tempfile_write_handle,
527            );
528            let bytes = response.copy_to(&mut download_wrapper)?;
529            download_wrapper.finish();
530            bytes
531        } else {
532            response.copy_to(&mut tempfile_write_handle)?
533        };
534
535        info!("Downloaded {} bytes", bytes);
536        debug!("Writing meta file");
537
538        let meta = Meta::new(
539            String::from(resource),
540            path.into(),
541            etag.clone(),
542            self.freshness_lifetime,
543        );
544        meta.to_file()?;
545
546        debug!("Renaming temp file to cache location for {}", url);
547
548        fs::rename(tempfile.path(), path)?;
549
550        Ok(meta)
551    }
552
553    fn try_get_etag(&self, resource: &str, url: &reqwest::Url) -> Result<Option<String>, Error> {
554        let mut retries: u32 = 0;
555        loop {
556            match self.get_etag(url) {
557                Ok(etag) => return Ok(etag),
558                Err(err) => {
559                    if retries >= self.max_retries {
560                        error!("Max retries exceeded for {}", resource);
561                        return Err(err);
562                    }
563                    if !err.is_retriable() {
564                        error!("ETAG fetch for {} failed with fatal error", resource);
565                        return Err(err);
566                    }
567                    retries += 1;
568                    let retry_delay = self.get_retry_delay(retries);
569                    warn!(
570                        "ETAG fetch failed for {}, retrying in {} milliseconds...",
571                        resource, retry_delay
572                    );
573                    thread::sleep(time::Duration::from_millis(u64::from(retry_delay)));
574                }
575            }
576        }
577    }
578
579    fn get_etag(&self, url: &reqwest::Url) -> Result<Option<String>, Error> {
580        debug!("Fetching ETAG for {}", url);
581        let response = self
582            .http_client
583            .head(url.clone())
584            .send()?
585            .error_for_status()?;
586        if let Some(etag) = response.headers().get(ETAG) {
587            if let Ok(s) = etag.to_str() {
588                Ok(Some(s.into()))
589            } else {
590                debug!("No ETAG for {}", url);
591                Ok(None)
592            }
593        } else {
594            Ok(None)
595        }
596    }
597
598    fn resource_to_filepath(
599        &self,
600        resource: &str,
601        etag: &Option<String>,
602        subdir: Option<&str>,
603        suffix: Option<&str>,
604    ) -> PathBuf {
605        let resource_hash = hash_str(resource);
606        let mut filename = if let Some(tag) = etag {
607            let etag_hash = hash_str(&tag[..]);
608            format!("{}.{}", resource_hash, etag_hash)
609        } else {
610            resource_hash
611        };
612
613        if let Some(suf) = suffix {
614            filename.push_str(suf);
615        }
616
617        let filepath = PathBuf::from(filename);
618
619        if let Some(subdir_path) = subdir {
620            self.dir.join(subdir_path).join(filepath)
621        } else {
622            self.dir.join(filepath)
623        }
624    }
625}
626
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::{tempdir, TempDir};

    /// The resource every test maps to a cache path.
    const RESOURCE: &str = "http://localhost:5000/foo.txt";
    /// The expected hash of `RESOURCE`.
    const RESOURCE_HASH: &str = "b5696dbf866311125e26a62bef0125854dd40f010a70be9cfd23634c997c1874";
    /// The expected hash of the ETag `"abcd"`.
    const ETAG_HASH: &str = "88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589";

    /// Build a cache rooted in a fresh temporary directory. The `TempDir` is
    /// returned as well so that the directory outlives the test body.
    fn cache_in_tempdir() -> (TempDir, Cache) {
        let root = tempdir().unwrap();
        let cache = Cache::builder()
            .dir(root.path().to_owned())
            .build()
            .unwrap();
        (root, cache)
    }

    /// Join `components` onto the cache root using the platform path separator.
    fn expected_path(root: &TempDir, components: &[&str]) -> String {
        let mut expected = String::from(root.path().to_str().unwrap());
        for component in components {
            expected.push(std::path::MAIN_SEPARATOR);
            expected.push_str(component);
        }
        expected
    }

    #[test]
    fn test_url_to_filename_with_etag() {
        let (cache_dir, cache) = cache_in_tempdir();
        let etag = String::from("abcd");

        let actual = cache.resource_to_filepath(RESOURCE, &Some(etag), None, None);
        let filename = format!("{}.{}", RESOURCE_HASH, ETAG_HASH);

        assert_eq!(
            actual.to_str().unwrap(),
            expected_path(&cache_dir, &[&filename])
        );
    }

    #[test]
    fn test_url_to_filename_no_etag() {
        let (cache_dir, cache) = cache_in_tempdir();

        let actual = cache.resource_to_filepath(RESOURCE, &None, None, None);

        assert_eq!(
            actual.to_str().unwrap(),
            expected_path(&cache_dir, &[RESOURCE_HASH])
        );
    }

    #[test]
    fn test_url_to_filename_in_subdir() {
        let (cache_dir, cache) = cache_in_tempdir();

        let actual = cache.resource_to_filepath(RESOURCE, &None, Some("target"), None);

        assert_eq!(
            actual.to_str().unwrap(),
            expected_path(&cache_dir, &["target", RESOURCE_HASH])
        );
    }

    #[test]
    fn test_url_to_filename_with_suffix() {
        let (cache_dir, cache) = cache_in_tempdir();

        let actual =
            cache.resource_to_filepath(RESOURCE, &None, Some("target"), Some("-extracted"));
        let filename = format!("{}-extracted", RESOURCE_HASH);

        assert_eq!(
            actual.to_str().unwrap(),
            expected_path(&cache_dir, &["target", &filename])
        );
    }
}