fetch_data/
lib.rs

1#![warn(clippy::all)]
2#![warn(clippy::pedantic)]
3#![warn(clippy::nursery)]
4#![warn(missing_docs)]
5#![allow(clippy::missing_errors_doc)]
6#![doc = include_str!("../README.md")]
7
8use anyinput::anyinput;
9/// Used to construct global `FetchData` instance.
10///
11/// This is a re-export from crate [`ctor`](https://crates.io/crates/ctor).
12pub use ctor::ctor;
13use directories::ProjectDirs;
14
15use sha2::{Digest, Sha256};
16use std::{
17    collections::HashMap,
18    fs::{self, read_dir, File},
19    path::PathBuf,
20    sync::Mutex,
21};
22use thiserror::Error;
23
/// Used to fetch data files from a URL, if needed. It verifies file contents via a hash.
///
/// # Thread Safety
///
/// `FetchData` works well with multithreaded testing. It is thread safe (via a `Mutex`).
pub struct FetchData {
    // Outcome of construction, guarded by a mutex: either the ready-to-use
    // `Internals` or the error raised while building them. `new` never fails,
    // so a construction error is kept here and reported by later method calls.
    mutex: Mutex<Result<Internals, Box<FetchDataError>>>,
}
33
34impl FetchData {
35    /// Create a new FetchData object.
36    ///
37    /// # Errors
38    ///
39    /// To make `FetchData` work well as a static global, `new` never fails. Instead, `FetchData` stores any error
40    /// and returns it when the first call to `fetch_file`, etc., is made.
41    ///
42    /// # Arguments
43    ///  *all inputs are string-like*
44    ///
45    /// * `registry_contents` - Whitespace delimited list of files and hashes.
46    ///           Use Rust's [`std::include_str`](https://doc.rust-lang.org/std/macro.include_str.html)
47    ///           macro to include the contents of a file.
48    /// * `url_root` - Base URL for remote files.
49    /// * `env_key` - Environment variable that may contain the path to the data directory.
50    ///           If not set, the data directory will be create via
51    ///           [`ProjectDirs`](https://docs.rs/directories/latest/directories/struct.ProjectDirs.html#method.from_path)
52    ///           and the next three arguments.
53    /// * `qualifier` - The reverse domain name notation of the application, excluding the organization or application name itself.
54    /// * `organization` - The name of the organization that develops this application.
55    /// * `application` - The name of the application itself.
56    ///
57    /// # Example
58    /// ```
59    /// use fetch_data::{FetchData};
60    ///
61    /// // Create a new FetchData instance.
62    /// let fetch_data = FetchData::new(
63    ///     "small.fam 36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2
64    ///      small.bim 56b6657a3766e2e52273f89d28be6135f9424ca1d204d29f3fa1c5a90eca794e",
65    ///     "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/",
66    ///     "BAR_APP_DATA_DIR",
67    ///     "com",
68    ///     "Foo Corp",
69    ///     "Bar App",
70    ///     );
71    ///
72    /// // If the local file exists and has the right hash, just return its path.
73    /// // Otherwise, download the file, confirm its hash, and return its path.
74    /// let local_path = fetch_data.fetch_file("small.bim")?;
75    /// assert!(local_path.exists());
76    /// # use fetch_data::FetchDataError;
77    /// # Ok::<(), Box<FetchDataError>>(())
78    /// ```
79    #[anyinput]
80    pub fn new(
81        registry_contents: AnyString,
82        url_root: AnyString,
83        env_key: AnyString,
84        qualifier: AnyString,
85        organization: AnyString,
86        application: AnyString,
87    ) -> Self {
88        Self {
89            mutex: Mutex::new(Internals::new(
90                registry_contents,
91                url_root,
92                env_key,
93                qualifier,
94                organization,
95                application,
96            )),
97        }
98    }
99
100    fn lock(&self) -> std::sync::MutexGuard<Result<Internals, Box<FetchDataError>>> {
101        match self.mutex.lock() {
102            Ok(lock) => lock,
103            Err(err) => err.into_inner(),
104        }
105    }
106
107    /// Fetch data files from a URL, but only if needed. Verify contents via a hash.
108    ///
109    /// # Example
110    /// ```
111    /// use fetch_data::{FetchData};
112    ///
113    /// // Create a new FetchData object.
114    /// let fetch_data = FetchData::new(
115    ///     "small.fam 36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2
116    ///      small.bim 56b6657a3766e2e52273f89d28be6135f9424ca1d204d29f3fa1c5a90eca794e",
117    ///     "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/",
118    ///     "BAR_APP_DATA_DIR",
119    ///     "com",
120    ///     "Foo Corp",
121    ///     "Bar App",
122    ///     );
123    ///
124    /// // If the local file exists and has the right hash, just return its path.
125    /// // Otherwise, download the file, confirm its hash, and return its path.
126    /// let local_path = fetch_data.fetch_file("small.bim")?;
127    /// assert!(local_path.exists());
128    /// # use fetch_data::FetchDataError;
129    /// # Ok::<(), Box<FetchDataError>>(())
130    /// ```
131    #[anyinput]
132    pub fn fetch_file(&self, path: AnyPath) -> Result<PathBuf, Box<FetchDataError>> {
133        let path_list = vec![path.to_path_buf()];
134        let vec = self.fetch_files(path_list)?;
135        Ok(vec[0].clone())
136    }
137
138    /// Given a list of files, returns a list of their local paths. If necessary, the files will be downloaded.
139    ///
140    /// # Example
141    /// ```
142    /// use fetch_data::{FetchData};
143    ///
144    /// // Create a new FetchData instance.
145    /// let fetch_data = FetchData::new(
146    ///     "small.fam 36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2
147    ///      small.bim 56b6657a3766e2e52273f89d28be6135f9424ca1d204d29f3fa1c5a90eca794e",
148    ///     "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/",
149    ///     "BAR_APP_DATA_DIR",
150    ///     "com",
151    ///     "Foo Corp",
152    ///     "Bar App",
153    ///     );
154    ///
155    /// // If a local file exists and has the right hash, just return its path
156    /// // in a list. Otherwise, download the file, confirm its hash, and return
157    /// //  its path in the list.
158    /// let local_path_list = fetch_data.fetch_files(["small.bim", "small.bim"])?;
159    /// assert!(local_path_list[0].exists() && local_path_list[1].exists());
160    /// # use fetch_data::FetchDataError;
161    /// # Ok::<(), Box<FetchDataError>>(())
162    /// ```
163    #[anyinput]
164    #[allow(clippy::significant_drop_tightening)]
165    pub fn fetch_files(
166        &self,
167        path_list: AnyIter<AnyPath>,
168    ) -> Result<Vec<PathBuf>, Box<FetchDataError>> {
169        let lock = self.lock();
170        // Convert Result to reference the error inside the Box for compatibility with internals()
171        let lock_ref = lock.as_ref().map_err(|e| &**e);
172        let internals = Self::internals(lock_ref)?;
173        let hash_registry = &internals.hash_registry;
174        let cache_dir = &internals.cache_dir;
175        let url_root = &internals.url_root;
176
177        let mut local_list: Vec<PathBuf> = Vec::new();
178        for path in path_list {
179            let path = path.as_ref();
180
181            let path_as_string = path.to_str().ok_or_else(|| {
182                Box::new(FetchDataSpecificError::UnknownOrBadFile("???".to_string()).into())
183            })?;
184
185            let Some(hash) = hash_registry.get(path) else {
186                return Err(Box::new(
187                    FetchDataSpecificError::UnknownOrBadFile(path_as_string.to_string()).into(),
188                ));
189            };
190
191            let local_path = cache_dir.join(path);
192            let url = format!("{url_root}{path_as_string}");
193            fetch(url, hash, &local_path)?;
194            local_list.push(local_path);
195        }
196
197        Ok(local_list)
198    }
199
200    fn internals<'a>(
201        lock_ref: Result<&'a Internals, &FetchDataError>,
202    ) -> Result<&'a Internals, Box<FetchDataError>> {
203        match lock_ref {
204            Ok(internals) => Ok(internals),
205            Err(e) => Err(Box::new(
206                FetchDataSpecificError::FetchDataNewFailed(e.to_string()).into(),
207            )),
208        }
209    }
210    /// Compute registry contents by downloading items and hashing them.
211    ///
212    /// # Tips
213    ///
214    /// * If you put the returned contents into a file, you can use Rust's [`std::include_str`](https://doc.rust-lang.org/std/macro.include_str.html)
215    ///   macro to include the contents of that file in [`FetchData::new`](struct.FetchData.html#method.new).
216    ///
217    /// * Use utility function [`fetch_data::dir_to_file_list`](fn.dir_to_file_list.html) to create a list of files in any local directory.
218    /// Note the hash is computed on download files, not any original local files.
219    ///
220    /// # Example
221    ///
222    /// ```
223    /// use fetch_data::{FetchData};
224    ///
225    /// // Create a new FetchData object.
226    /// let fetch_data = FetchData::new(
227    ///     "", // ignored
228    ///     "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/",
229    ///     "BAR_APP_DATA_DIR",
230    ///     "com",
231    ///     "Foo Corp",
232    ///     "Bar App",
233    ///     );
234    ///
235    /// // Even if local files exist, download each file. Hash each file. Return the results as a string.
236    /// let registry_contents = fetch_data.gen_registry_contents(["small.fam", "small.bim"])?;
237    /// println!("{registry_contents}"); // Prints:
238    ///                                  // small.fam 36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2
239    ///                                  // small.bim 56b6657a3766e2e52273f89d28be6135f9424ca1d204d29f3fa1c5a90eca794e
240    /// # use fetch_data::FetchDataError;
241    /// # Ok::<(), Box<FetchDataError>>(())
242    /// ```
243    #[anyinput]
244    #[allow(clippy::significant_drop_tightening)]
245    pub fn gen_registry_contents(
246        &self,
247        path_list: AnyIter<AnyPath>,
248    ) -> Result<String, Box<FetchDataError>> {
249        let lock = self.lock();
250        // Convert Result to reference the error inside the Box for compatibility with internals()
251        let lock_ref = lock.as_ref().map_err(|e| &**e);
252        let internals = Self::internals(lock_ref)?;
253        let cache_dir = &internals.cache_dir;
254        let url_root = &internals.url_root;
255
256        let mut s = String::new();
257        for path in path_list {
258            let path = path.as_ref();
259
260            let Some(path_as_string) = path.to_str() else {
261                return Err(Box::new(
262                    FetchDataSpecificError::UnknownOrBadFile("???".to_string()).into(),
263                ));
264            };
265
266            let local_path = cache_dir.join(path);
267            let url = format!("{url_root}{path_as_string}");
268            download(url, &local_path)?;
269            let hash = hash_file(&local_path)?;
270            s.push_str(&format!("{} {hash}\n", path.display()));
271        }
272
273        Ok(s)
274    }
275
276    /// Return the path to the local cache directory.
277    #[allow(clippy::significant_drop_tightening)]
278    pub fn cache_dir(&self) -> Result<PathBuf, Box<FetchDataError>> {
279        let lock = self.lock();
280        // Convert Result to reference the error inside the Box for compatibility with internals()
281        let lock_ref = lock.as_ref().map_err(|e| &**e);
282        let internals = Self::internals(lock_ref)?;
283        let cache_dir = &internals.cache_dir;
284        Ok(cache_dir.to_owned())
285    }
286}
287
288/// All possible errors returned by this crate and the crates it depends on.
289// Based on `<https://nick.groenen.me/posts/rust-error-handling/#the-library-error-type>`
290#[derive(Error, Debug)]
291pub enum FetchDataError {
292    #[allow(missing_docs)]
293    #[error(transparent)]
294    FetchDataError(#[from] FetchDataSpecificError),
295
296    #[allow(missing_docs)]
297    #[error(transparent)]
298    IOError(#[from] std::io::Error),
299
300    #[allow(missing_docs)]
301    #[error(transparent)]
302    UreqError(#[from] ureq::Error),
303}
304/// All errors specific to this crate.
305#[derive(Error, Debug, Clone)]
306pub enum FetchDataSpecificError {
307    #[allow(missing_docs)]
308    #[error("Unknown or bad file '{0}'")]
309    UnknownOrBadFile(String),
310
311    #[allow(missing_docs)]
312    #[error("The registry of files is invalid")]
313    RegistryProblem(),
314
315    #[allow(missing_docs)]
316    #[error("FetchData new failed with error: {0}")]
317    FetchDataNewFailed(String),
318
319    #[allow(missing_docs)]
320    #[error("Downloaded file not seen: {0}")]
321    DownloadedFileNotSeen(String),
322
323    #[allow(missing_docs)]
324    #[error("Downloaded file has wrong hash: {0},expected: {1}, actual: {2}")]
325    DownloadedFileWrongHash(String, String, String),
326
327    #[allow(missing_docs)]
328    #[error("Cannot create cache directory")]
329    CannotCreateCacheDir(),
330}
331
332/// If necessary, retrieve a file from a URL, checking its hash.
333/// # Example
334/// ```
335/// use fetch_data::fetch;
336/// use temp_testdir::TempDir;
337///
338/// // Create a temporary local directory.
339/// let temp_dir = TempDir::default();
340/// // Download the file and check its hash.
341/// let path = temp_dir.join("small.fam");
342/// fetch(
343///     "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
344///     "36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2",
345///     &path,
346/// )?;
347/// assert!(&path.exists());
348/// // This time, because the local file exists and has the correct hash, no download is performed.
349/// fetch(
350///     "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
351///     "36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2",
352///     &path,
353/// )?;
354/// assert!(&path.exists());
355/// # use fetch_data::FetchDataError;
356/// # Ok::<(), Box<FetchDataError>>(())
357/// ```
358#[anyinput]
359pub fn fetch(url: AnyString, hash: AnyString, path: AnyPath) -> Result<(), Box<FetchDataError>> {
360    if !path.exists() {
361        download(url, path)?;
362    }
363    let actual_hash = hash_file(path)?;
364    if !actual_hash.eq(hash) {
365        return Err(Box::new(
366            FetchDataSpecificError::DownloadedFileWrongHash(
367                path.display().to_string(),
368                hash.to_string(),
369                actual_hash,
370            )
371            .into(),
372        ));
373    }
374    Ok(())
375}
376
377/// Download a file from a URL and compute its hash.
378///
379/// # Example
380/// ```
381/// use fetch_data::hash_download;
382/// use temp_testdir::TempDir;
383///
384/// // Create a temporary local directory.
385/// let temp_dir = TempDir::default();
386/// let path = temp_dir.join("small.fam");
387/// // Download a file and compute its hash.
388/// let hash = hash_download(
389///     "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
390///    &path,
391/// )?;
392/// assert!(hash.eq("36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2"));
393/// # use fetch_data::FetchDataError;
394/// # Ok::<(), Box<FetchDataError>>(())
395/// ```
396#[anyinput]
397pub fn hash_download(url: AnyString, path: AnyPath) -> Result<String, Box<FetchDataError>> {
398    download(url, path)?;
399    hash_file(path)
400}
401
402/// Compute the hash (SHA256) of a local file.
403///
404/// # Example
405/// ```
406/// use fetch_data::{hash_file, download};
407/// use temp_testdir::TempDir;
408///
409/// // Download a file to a temporary directory.
410/// let temp_dir = TempDir::default();
411/// let path = temp_dir.join("small.fam");
412/// download(
413///     "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
414///     &path,
415/// )?;
416/// // Compute the hash of the file.
417/// let hash = hash_file(&path)?;
418/// assert!(hash.eq("36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2"));
419/// # use fetch_data::FetchDataError;
420/// # Ok::<(), Box<FetchDataError>>(())
421#[anyinput]
422pub fn hash_file(path: AnyPath) -> Result<String, Box<FetchDataError>> {
423    let mut sha256 = Sha256::new();
424    let mut file = File::open(path).map_err(|e| Box::new(e.into()))?;
425
426    std::io::copy(&mut file, &mut sha256).map_err(|e| Box::new(e.into()))?;
427    let hash_bytes = sha256.finalize();
428
429    let hex_hash = base16ct::lower::encode_string(&hash_bytes);
430    Ok(hex_hash)
431}
432
433/// Download a file from a URL.
434///
435/// # Example
436/// ```
437/// use fetch_data::download;
438/// use temp_testdir::TempDir;
439///
440/// // Create a temporary local directory.
441/// let temp_dir = TempDir::default();
442/// // Download a file to the temporary directory.
443/// let path = temp_dir.join("small.fam");
444/// download(
445///     "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
446///     &path,
447/// )?;
448/// assert!(path.exists());
449/// # use fetch_data::FetchDataError;
450/// # Ok::<(), Box<FetchDataError>>(())
451/// ```
452#[anyinput]
453pub fn download(url: AnyString, path: AnyPath) -> Result<(), Box<FetchDataError>> {
454    let req = ureq::get(url).call().map_err(|e| Box::new(e.into()))?;
455    let mut reader = req.into_reader();
456    let mut file = File::create(path).map_err(|e| Box::new(e.into()))?;
457    std::io::copy(&mut reader, &mut file).map_err(|e| Box::new(e.into()))?;
458    if !path.exists() {
459        return Err(Box::new(
460            FetchDataSpecificError::DownloadedFileNotSeen(path.display().to_string()).into(),
461        ));
462    }
463    Ok(())
464}
465
466fn hash_registry(registry_contents: &str) -> Result<HashMap<PathBuf, String>, Box<FetchDataError>> {
467    let mut hash_map = HashMap::new();
468    for line in registry_contents.lines() {
469        let mut parts = line.split_whitespace();
470
471        let url = if let Some(url) = parts.next() {
472            if url.is_empty() {
473                return Err(Box::new(FetchDataSpecificError::RegistryProblem().into()));
474            }
475            PathBuf::from(url)
476        } else {
477            return Err(Box::new(FetchDataSpecificError::RegistryProblem().into()));
478        };
479        let hash = if let Some(hash) = parts.next() {
480            hash.to_string()
481        } else {
482            return Err(Box::new(FetchDataSpecificError::RegistryProblem().into()));
483        };
484        if hash.is_empty() || parts.next().is_some() {
485            return Err(Box::new(FetchDataSpecificError::RegistryProblem().into()));
486        }
487
488        hash_map.insert(url, hash.clone());
489    }
490    Ok(hash_map)
491}
492
493/// List all the files in a local directory.
494///
495/// # Example
496/// ```
497/// use fetch_data::{dir_to_file_list, download};
498/// use temp_testdir::TempDir;
499///
500/// // Create a local directory and download two files to it.
501/// let temp_dir = TempDir::default();
502/// download(
503///     "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
504///     temp_dir.join("small.fam"),
505/// )?;
506/// download(
507///     "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.bim",
508///     temp_dir.join("small.bim"),
509/// )?;
510/// // List the files in the directory.
511/// let file_list = dir_to_file_list(temp_dir)?;
512/// println!("{file_list:?}"); // Prints ["small.bim", "small.fam"]
513/// # use fetch_data::FetchDataError;
514/// # Ok::<(), Box<FetchDataError>>(())
515/// ```
516#[anyinput]
517pub fn dir_to_file_list(path: AnyPath) -> Result<Vec<std::ffi::OsString>, Box<FetchDataError>> {
518    let file_list = read_dir(path)
519        .map_err(|e| Box::new(e.into()))?
520        .map(|res| res.map(|e| e.file_name()))
521        .collect::<Result<Vec<_>, std::io::Error>>()
522        .map_err(|e| Box::new(e.into()))?;
523    Ok(file_list)
524}
/// State built once by `FetchData::new` and read by the `FetchData` methods.
struct Internals {
    /// Local directory that fetched files are stored under (each file's path
    /// is joined onto it).
    cache_dir: PathBuf,
    /// Maps each registered file path to its expected hash string.
    hash_registry: HashMap<PathBuf, String>,
    /// Prefix that a file's path is appended to in order to form its URL.
    url_root: String,
}
530
531impl Internals {
532    fn new(
533        registry_contents: &str,
534        url_root: &str,
535        env_key: &str,
536        qualifier: &str,
537        organization: &str,
538        application: &str,
539    ) -> Result<Self, Box<FetchDataError>> {
540        let cache_dir = Self::cache_dir(env_key, qualifier, organization, application)?;
541        let hash_registry = hash_registry(registry_contents)?;
542
543        Ok(Self {
544            cache_dir,
545            hash_registry,
546            url_root: url_root.to_string(),
547        })
548    }
549
550    fn cache_dir(
551        env_key: &str,
552        qualifier: &str,
553        organization: &str,
554        application: &str,
555    ) -> Result<PathBuf, Box<FetchDataError>> {
556        let cache_dir = if let Ok(cache_dir) = std::env::var(env_key) {
557            PathBuf::from(cache_dir)
558        } else if let Some(proj_dirs) = ProjectDirs::from(qualifier, organization, application) {
559            proj_dirs.cache_dir().to_owned()
560        } else {
561            return Err(Box::new(
562                FetchDataSpecificError::CannotCreateCacheDir().into(),
563            ));
564        };
565        if !cache_dir.exists() {
566            fs::create_dir_all(&cache_dir).map_err(|e| Box::new(e.into()))?;
567        }
568        Ok(cache_dir)
569    }
570}
571
// Sample global `FetchData` instance used by `sample_file`. Its registry text
// is embedded at compile time from `../registry.txt`. Because `FetchData::new`
// never fails, a construction error is only reported when the instance is used.
#[ctor]
static STATIC_FETCH_DATA: FetchData = FetchData::new(
    include_str!("../registry.txt"),
    "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/",
    "BAR_APP_DATA_DIR",
    "com",
    "Foo Corp",
    "Bar App",
);
581
/// A sample `sample_file` function. Don't use this. Instead, define your own `sample_file` function
/// that knows how to fetch your data files.
///
/// It looks up `path` through this crate's sample global `FetchData` instance
/// (by calling `fetch_file`) and returns the resulting local path, or the
/// error that `fetch_file` reports.
#[anyinput]
pub fn sample_file(path: AnyPath) -> Result<PathBuf, Box<FetchDataError>> {
    STATIC_FETCH_DATA.fetch_file(path)
}
587}