fetch_data/lib.rs
1#![warn(clippy::all)]
2#![warn(clippy::pedantic)]
3#![warn(clippy::nursery)]
4#![warn(missing_docs)]
5#![allow(clippy::missing_errors_doc)]
6#![doc = include_str!("../README.md")]
7
8use anyinput::anyinput;
9/// Used to construct global `FetchData` instance.
10///
11/// This is a re-export from crate [`ctor`](https://crates.io/crates/ctor).
12pub use ctor::ctor;
13use directories::ProjectDirs;
14
15use sha2::{Digest, Sha256};
16use std::{
17 collections::HashMap,
18 fs::{self, read_dir, File},
19 path::PathBuf,
20 sync::Mutex,
21};
22use thiserror::Error;
23
/// Used to fetch data files from a URL, if needed. It verifies file contents via a hash.
///
/// # Thread Safety
///
/// `FetchData` works well with multithreaded testing. It is thread safe (via a Mutex).
///
pub struct FetchData {
    // Holds either the usable state or the error produced while building it;
    // `new` stores the outcome here instead of returning a `Result`.
    mutex: Mutex<Result<Internals, Box<FetchDataError>>>,
}
33
34impl FetchData {
35 /// Create a new FetchData object.
36 ///
37 /// # Errors
38 ///
39 /// To make `FetchData` work well as a static global, `new` never fails. Instead, `FetchData` stores any error
40 /// and returns it when the first call to `fetch_file`, etc., is made.
41 ///
42 /// # Arguments
43 /// *all inputs are string-like*
44 ///
45 /// * `registry_contents` - Whitespace delimited list of files and hashes.
46 /// Use Rust's [`std::include_str`](https://doc.rust-lang.org/std/macro.include_str.html)
47 /// macro to include the contents of a file.
48 /// * `url_root` - Base URL for remote files.
49 /// * `env_key` - Environment variable that may contain the path to the data directory.
50 /// If not set, the data directory will be create via
51 /// [`ProjectDirs`](https://docs.rs/directories/latest/directories/struct.ProjectDirs.html#method.from_path)
52 /// and the next three arguments.
53 /// * `qualifier` - The reverse domain name notation of the application, excluding the organization or application name itself.
54 /// * `organization` - The name of the organization that develops this application.
55 /// * `application` - The name of the application itself.
56 ///
57 /// # Example
58 /// ```
59 /// use fetch_data::{FetchData};
60 ///
61 /// // Create a new FetchData instance.
62 /// let fetch_data = FetchData::new(
63 /// "small.fam 36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2
64 /// small.bim 56b6657a3766e2e52273f89d28be6135f9424ca1d204d29f3fa1c5a90eca794e",
65 /// "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/",
66 /// "BAR_APP_DATA_DIR",
67 /// "com",
68 /// "Foo Corp",
69 /// "Bar App",
70 /// );
71 ///
72 /// // If the local file exists and has the right hash, just return its path.
73 /// // Otherwise, download the file, confirm its hash, and return its path.
74 /// let local_path = fetch_data.fetch_file("small.bim")?;
75 /// assert!(local_path.exists());
76 /// # use fetch_data::FetchDataError;
77 /// # Ok::<(), Box<FetchDataError>>(())
78 /// ```
79 #[anyinput]
80 pub fn new(
81 registry_contents: AnyString,
82 url_root: AnyString,
83 env_key: AnyString,
84 qualifier: AnyString,
85 organization: AnyString,
86 application: AnyString,
87 ) -> Self {
88 Self {
89 mutex: Mutex::new(Internals::new(
90 registry_contents,
91 url_root,
92 env_key,
93 qualifier,
94 organization,
95 application,
96 )),
97 }
98 }
99
100 fn lock(&self) -> std::sync::MutexGuard<Result<Internals, Box<FetchDataError>>> {
101 match self.mutex.lock() {
102 Ok(lock) => lock,
103 Err(err) => err.into_inner(),
104 }
105 }
106
107 /// Fetch data files from a URL, but only if needed. Verify contents via a hash.
108 ///
109 /// # Example
110 /// ```
111 /// use fetch_data::{FetchData};
112 ///
113 /// // Create a new FetchData object.
114 /// let fetch_data = FetchData::new(
115 /// "small.fam 36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2
116 /// small.bim 56b6657a3766e2e52273f89d28be6135f9424ca1d204d29f3fa1c5a90eca794e",
117 /// "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/",
118 /// "BAR_APP_DATA_DIR",
119 /// "com",
120 /// "Foo Corp",
121 /// "Bar App",
122 /// );
123 ///
124 /// // If the local file exists and has the right hash, just return its path.
125 /// // Otherwise, download the file, confirm its hash, and return its path.
126 /// let local_path = fetch_data.fetch_file("small.bim")?;
127 /// assert!(local_path.exists());
128 /// # use fetch_data::FetchDataError;
129 /// # Ok::<(), Box<FetchDataError>>(())
130 /// ```
131 #[anyinput]
132 pub fn fetch_file(&self, path: AnyPath) -> Result<PathBuf, Box<FetchDataError>> {
133 let path_list = vec![path.to_path_buf()];
134 let vec = self.fetch_files(path_list)?;
135 Ok(vec[0].clone())
136 }
137
138 /// Given a list of files, returns a list of their local paths. If necessary, the files will be downloaded.
139 ///
140 /// # Example
141 /// ```
142 /// use fetch_data::{FetchData};
143 ///
144 /// // Create a new FetchData instance.
145 /// let fetch_data = FetchData::new(
146 /// "small.fam 36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2
147 /// small.bim 56b6657a3766e2e52273f89d28be6135f9424ca1d204d29f3fa1c5a90eca794e",
148 /// "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/",
149 /// "BAR_APP_DATA_DIR",
150 /// "com",
151 /// "Foo Corp",
152 /// "Bar App",
153 /// );
154 ///
155 /// // If a local file exists and has the right hash, just return its path
156 /// // in a list. Otherwise, download the file, confirm its hash, and return
157 /// // its path in the list.
158 /// let local_path_list = fetch_data.fetch_files(["small.bim", "small.bim"])?;
159 /// assert!(local_path_list[0].exists() && local_path_list[1].exists());
160 /// # use fetch_data::FetchDataError;
161 /// # Ok::<(), Box<FetchDataError>>(())
162 /// ```
163 #[anyinput]
164 #[allow(clippy::significant_drop_tightening)]
165 pub fn fetch_files(
166 &self,
167 path_list: AnyIter<AnyPath>,
168 ) -> Result<Vec<PathBuf>, Box<FetchDataError>> {
169 let lock = self.lock();
170 // Convert Result to reference the error inside the Box for compatibility with internals()
171 let lock_ref = lock.as_ref().map_err(|e| &**e);
172 let internals = Self::internals(lock_ref)?;
173 let hash_registry = &internals.hash_registry;
174 let cache_dir = &internals.cache_dir;
175 let url_root = &internals.url_root;
176
177 let mut local_list: Vec<PathBuf> = Vec::new();
178 for path in path_list {
179 let path = path.as_ref();
180
181 let path_as_string = path.to_str().ok_or_else(|| {
182 Box::new(FetchDataSpecificError::UnknownOrBadFile("???".to_string()).into())
183 })?;
184
185 let Some(hash) = hash_registry.get(path) else {
186 return Err(Box::new(
187 FetchDataSpecificError::UnknownOrBadFile(path_as_string.to_string()).into(),
188 ));
189 };
190
191 let local_path = cache_dir.join(path);
192 let url = format!("{url_root}{path_as_string}");
193 fetch(url, hash, &local_path)?;
194 local_list.push(local_path);
195 }
196
197 Ok(local_list)
198 }
199
200 fn internals<'a>(
201 lock_ref: Result<&'a Internals, &FetchDataError>,
202 ) -> Result<&'a Internals, Box<FetchDataError>> {
203 match lock_ref {
204 Ok(internals) => Ok(internals),
205 Err(e) => Err(Box::new(
206 FetchDataSpecificError::FetchDataNewFailed(e.to_string()).into(),
207 )),
208 }
209 }
210 /// Compute registry contents by downloading items and hashing them.
211 ///
212 /// # Tips
213 ///
214 /// * If you put the returned contents into a file, you can use Rust's [`std::include_str`](https://doc.rust-lang.org/std/macro.include_str.html)
215 /// macro to include the contents of that file in [`FetchData::new`](struct.FetchData.html#method.new).
216 ///
217 /// * Use utility function [`fetch_data::dir_to_file_list`](fn.dir_to_file_list.html) to create a list of files in any local directory.
218 /// Note the hash is computed on download files, not any original local files.
219 ///
220 /// # Example
221 ///
222 /// ```
223 /// use fetch_data::{FetchData};
224 ///
225 /// // Create a new FetchData object.
226 /// let fetch_data = FetchData::new(
227 /// "", // ignored
228 /// "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/",
229 /// "BAR_APP_DATA_DIR",
230 /// "com",
231 /// "Foo Corp",
232 /// "Bar App",
233 /// );
234 ///
235 /// // Even if local files exist, download each file. Hash each file. Return the results as a string.
236 /// let registry_contents = fetch_data.gen_registry_contents(["small.fam", "small.bim"])?;
237 /// println!("{registry_contents}"); // Prints:
238 /// // small.fam 36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2
239 /// // small.bim 56b6657a3766e2e52273f89d28be6135f9424ca1d204d29f3fa1c5a90eca794e
240 /// # use fetch_data::FetchDataError;
241 /// # Ok::<(), Box<FetchDataError>>(())
242 /// ```
243 #[anyinput]
244 #[allow(clippy::significant_drop_tightening)]
245 pub fn gen_registry_contents(
246 &self,
247 path_list: AnyIter<AnyPath>,
248 ) -> Result<String, Box<FetchDataError>> {
249 let lock = self.lock();
250 // Convert Result to reference the error inside the Box for compatibility with internals()
251 let lock_ref = lock.as_ref().map_err(|e| &**e);
252 let internals = Self::internals(lock_ref)?;
253 let cache_dir = &internals.cache_dir;
254 let url_root = &internals.url_root;
255
256 let mut s = String::new();
257 for path in path_list {
258 let path = path.as_ref();
259
260 let Some(path_as_string) = path.to_str() else {
261 return Err(Box::new(
262 FetchDataSpecificError::UnknownOrBadFile("???".to_string()).into(),
263 ));
264 };
265
266 let local_path = cache_dir.join(path);
267 let url = format!("{url_root}{path_as_string}");
268 download(url, &local_path)?;
269 let hash = hash_file(&local_path)?;
270 s.push_str(&format!("{} {hash}\n", path.display()));
271 }
272
273 Ok(s)
274 }
275
276 /// Return the path to the local cache directory.
277 #[allow(clippy::significant_drop_tightening)]
278 pub fn cache_dir(&self) -> Result<PathBuf, Box<FetchDataError>> {
279 let lock = self.lock();
280 // Convert Result to reference the error inside the Box for compatibility with internals()
281 let lock_ref = lock.as_ref().map_err(|e| &**e);
282 let internals = Self::internals(lock_ref)?;
283 let cache_dir = &internals.cache_dir;
284 Ok(cache_dir.to_owned())
285 }
286}
287
/// All possible errors returned by this crate and the crates it depends on.
// Based on `<https://nick.groenen.me/posts/rust-error-handling/#the-library-error-type>`
#[derive(Error, Debug)]
pub enum FetchDataError {
    /// An error specific to this crate; see [`FetchDataSpecificError`].
    #[allow(missing_docs)]
    #[error(transparent)]
    FetchDataError(#[from] FetchDataSpecificError),

    /// An I/O error from the standard library (opening, creating, reading or writing files and directories).
    #[allow(missing_docs)]
    #[error(transparent)]
    IOError(#[from] std::io::Error),

    /// An error reported by `ureq` while performing the HTTP request.
    #[allow(missing_docs)]
    #[error(transparent)]
    UreqError(#[from] ureq::Error),
}
/// All errors specific to this crate.
#[derive(Error, Debug, Clone)]
pub enum FetchDataSpecificError {
    /// A requested file is not listed in the registry, or its path is not valid UTF-8.
    /// The payload is text identifying the file.
    #[allow(missing_docs)]
    #[error("Unknown or bad file '{0}'")]
    UnknownOrBadFile(String),

    /// The registry text could not be parsed into file/hash pairs.
    #[allow(missing_docs)]
    #[error("The registry of files is invalid")]
    RegistryProblem(),

    /// Construction of the `FetchData` instance failed earlier; the payload is the
    /// text of the error that was captured at that time.
    #[allow(missing_docs)]
    #[error("FetchData new failed with error: {0}")]
    FetchDataNewFailed(String),

    /// After a download completed, no file exists at the target path (the payload).
    #[allow(missing_docs)]
    #[error("Downloaded file not seen: {0}")]
    DownloadedFileNotSeen(String),

    /// The file's hash differs from the expected one.
    /// Payload order: local path, expected hash, actual hash.
    #[allow(missing_docs)]
    #[error("Downloaded file has wrong hash: {0},expected: {1}, actual: {2}")]
    DownloadedFileWrongHash(String, String, String),

    /// The environment variable is not usable and no project directory could be
    /// determined, so there is nowhere to put the cache.
    #[allow(missing_docs)]
    #[error("Cannot create cache directory")]
    CannotCreateCacheDir(),
}
331
332/// If necessary, retrieve a file from a URL, checking its hash.
333/// # Example
334/// ```
335/// use fetch_data::fetch;
336/// use temp_testdir::TempDir;
337///
338/// // Create a temporary local directory.
339/// let temp_dir = TempDir::default();
340/// // Download the file and check its hash.
341/// let path = temp_dir.join("small.fam");
342/// fetch(
343/// "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
344/// "36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2",
345/// &path,
346/// )?;
347/// assert!(&path.exists());
348/// // This time, because the local file exists and has the correct hash, no download is performed.
349/// fetch(
350/// "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
351/// "36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2",
352/// &path,
353/// )?;
354/// assert!(&path.exists());
355/// # use fetch_data::FetchDataError;
356/// # Ok::<(), Box<FetchDataError>>(())
357/// ```
358#[anyinput]
359pub fn fetch(url: AnyString, hash: AnyString, path: AnyPath) -> Result<(), Box<FetchDataError>> {
360 if !path.exists() {
361 download(url, path)?;
362 }
363 let actual_hash = hash_file(path)?;
364 if !actual_hash.eq(hash) {
365 return Err(Box::new(
366 FetchDataSpecificError::DownloadedFileWrongHash(
367 path.display().to_string(),
368 hash.to_string(),
369 actual_hash,
370 )
371 .into(),
372 ));
373 }
374 Ok(())
375}
376
377/// Download a file from a URL and compute its hash.
378///
379/// # Example
380/// ```
381/// use fetch_data::hash_download;
382/// use temp_testdir::TempDir;
383///
384/// // Create a temporary local directory.
385/// let temp_dir = TempDir::default();
386/// let path = temp_dir.join("small.fam");
387/// // Download a file and compute its hash.
388/// let hash = hash_download(
389/// "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
390/// &path,
391/// )?;
392/// assert!(hash.eq("36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2"));
393/// # use fetch_data::FetchDataError;
394/// # Ok::<(), Box<FetchDataError>>(())
395/// ```
396#[anyinput]
397pub fn hash_download(url: AnyString, path: AnyPath) -> Result<String, Box<FetchDataError>> {
398 download(url, path)?;
399 hash_file(path)
400}
401
402/// Compute the hash (SHA256) of a local file.
403///
404/// # Example
405/// ```
406/// use fetch_data::{hash_file, download};
407/// use temp_testdir::TempDir;
408///
409/// // Download a file to a temporary directory.
410/// let temp_dir = TempDir::default();
411/// let path = temp_dir.join("small.fam");
412/// download(
413/// "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
414/// &path,
415/// )?;
416/// // Compute the hash of the file.
417/// let hash = hash_file(&path)?;
418/// assert!(hash.eq("36e0086c0353ff336d0533330dbacb12c75e37dc3cba174313635b98dfe86ed2"));
419/// # use fetch_data::FetchDataError;
420/// # Ok::<(), Box<FetchDataError>>(())
421#[anyinput]
422pub fn hash_file(path: AnyPath) -> Result<String, Box<FetchDataError>> {
423 let mut sha256 = Sha256::new();
424 let mut file = File::open(path).map_err(|e| Box::new(e.into()))?;
425
426 std::io::copy(&mut file, &mut sha256).map_err(|e| Box::new(e.into()))?;
427 let hash_bytes = sha256.finalize();
428
429 let hex_hash = base16ct::lower::encode_string(&hash_bytes);
430 Ok(hex_hash)
431}
432
433/// Download a file from a URL.
434///
435/// # Example
436/// ```
437/// use fetch_data::download;
438/// use temp_testdir::TempDir;
439///
440/// // Create a temporary local directory.
441/// let temp_dir = TempDir::default();
442/// // Download a file to the temporary directory.
443/// let path = temp_dir.join("small.fam");
444/// download(
445/// "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
446/// &path,
447/// )?;
448/// assert!(path.exists());
449/// # use fetch_data::FetchDataError;
450/// # Ok::<(), Box<FetchDataError>>(())
451/// ```
452#[anyinput]
453pub fn download(url: AnyString, path: AnyPath) -> Result<(), Box<FetchDataError>> {
454 let req = ureq::get(url).call().map_err(|e| Box::new(e.into()))?;
455 let mut reader = req.into_reader();
456 let mut file = File::create(path).map_err(|e| Box::new(e.into()))?;
457 std::io::copy(&mut reader, &mut file).map_err(|e| Box::new(e.into()))?;
458 if !path.exists() {
459 return Err(Box::new(
460 FetchDataSpecificError::DownloadedFileNotSeen(path.display().to_string()).into(),
461 ));
462 }
463 Ok(())
464}
465
466fn hash_registry(registry_contents: &str) -> Result<HashMap<PathBuf, String>, Box<FetchDataError>> {
467 let mut hash_map = HashMap::new();
468 for line in registry_contents.lines() {
469 let mut parts = line.split_whitespace();
470
471 let url = if let Some(url) = parts.next() {
472 if url.is_empty() {
473 return Err(Box::new(FetchDataSpecificError::RegistryProblem().into()));
474 }
475 PathBuf::from(url)
476 } else {
477 return Err(Box::new(FetchDataSpecificError::RegistryProblem().into()));
478 };
479 let hash = if let Some(hash) = parts.next() {
480 hash.to_string()
481 } else {
482 return Err(Box::new(FetchDataSpecificError::RegistryProblem().into()));
483 };
484 if hash.is_empty() || parts.next().is_some() {
485 return Err(Box::new(FetchDataSpecificError::RegistryProblem().into()));
486 }
487
488 hash_map.insert(url, hash.clone());
489 }
490 Ok(hash_map)
491}
492
493/// List all the files in a local directory.
494///
495/// # Example
496/// ```
497/// use fetch_data::{dir_to_file_list, download};
498/// use temp_testdir::TempDir;
499///
500/// // Create a local directory and download two files to it.
501/// let temp_dir = TempDir::default();
502/// download(
503/// "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.fam",
504/// temp_dir.join("small.fam"),
505/// )?;
506/// download(
507/// "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/small.bim",
508/// temp_dir.join("small.bim"),
509/// )?;
510/// // List the files in the directory.
511/// let file_list = dir_to_file_list(temp_dir)?;
512/// println!("{file_list:?}"); // Prints ["small.bim", "small.fam"]
513/// # use fetch_data::FetchDataError;
514/// # Ok::<(), Box<FetchDataError>>(())
515/// ```
516#[anyinput]
517pub fn dir_to_file_list(path: AnyPath) -> Result<Vec<std::ffi::OsString>, Box<FetchDataError>> {
518 let file_list = read_dir(path)
519 .map_err(|e| Box::new(e.into()))?
520 .map(|res| res.map(|e| e.file_name()))
521 .collect::<Result<Vec<_>, std::io::Error>>()
522 .map_err(|e| Box::new(e.into()))?;
523 Ok(file_list)
524}
// State shared by all `FetchData` operations once construction has succeeded.
struct Internals {
    // Local directory under which fetched files are stored.
    cache_dir: PathBuf,
    // Expected hash for each known file, keyed by the file's path as written in the registry.
    hash_registry: HashMap<PathBuf, String>,
    // Prefix to which a file's path is appended to form its download URL.
    url_root: String,
}
530
531impl Internals {
532 fn new(
533 registry_contents: &str,
534 url_root: &str,
535 env_key: &str,
536 qualifier: &str,
537 organization: &str,
538 application: &str,
539 ) -> Result<Self, Box<FetchDataError>> {
540 let cache_dir = Self::cache_dir(env_key, qualifier, organization, application)?;
541 let hash_registry = hash_registry(registry_contents)?;
542
543 Ok(Self {
544 cache_dir,
545 hash_registry,
546 url_root: url_root.to_string(),
547 })
548 }
549
550 fn cache_dir(
551 env_key: &str,
552 qualifier: &str,
553 organization: &str,
554 application: &str,
555 ) -> Result<PathBuf, Box<FetchDataError>> {
556 let cache_dir = if let Ok(cache_dir) = std::env::var(env_key) {
557 PathBuf::from(cache_dir)
558 } else if let Some(proj_dirs) = ProjectDirs::from(qualifier, organization, application) {
559 proj_dirs.cache_dir().to_owned()
560 } else {
561 return Err(Box::new(
562 FetchDataSpecificError::CannotCreateCacheDir().into(),
563 ));
564 };
565 if !cache_dir.exists() {
566 fs::create_dir_all(&cache_dir).map_err(|e| Box::new(e.into()))?;
567 }
568 Ok(cache_dir)
569 }
570}
571
// Sample global instance used by `sample_file`. It is set up through the `ctor`
// attribute from the bundled `registry.txt`; because `FetchData::new` never
// fails, any setup error is reported only when the instance is used.
#[ctor]
static STATIC_FETCH_DATA: FetchData = FetchData::new(
    include_str!("../registry.txt"),
    "https://raw.githubusercontent.com/CarlKCarlK/fetch-data/main/tests/data/",
    "BAR_APP_DATA_DIR",
    "com",
    "Foo Corp",
    "Bar App",
);
581
/// A sample `sample_file` function. Don't use this. Instead, define your own `sample_file` function
/// that knows how to fetch your data files.
///
/// Delegates to `fetch_file` on this crate's sample global `FetchData` instance and
/// returns the local path of the requested file.
#[anyinput]
pub fn sample_file(path: AnyPath) -> Result<PathBuf, Box<FetchDataError>> {
    STATIC_FETCH_DATA.fetch_file(path)
}