// xtest_data/lib.rs

//! Fetch test data in packaged crate tests.
//!
//! # For crate authors
//!
//! Drop these lines into your _integration tests_ (due to a limitation in `cargo` this will only
//! work in integration tests right now¹). Note that this requires your repository—through the URL
//! contained in `Cargo.toml`—to be readable by the environment where you wish to test the packaged
//! crate.
//!
//! ```rust
//! use std::path::PathBuf;
//!
//! // or any other file you want to use.
//! let mut datazip = PathBuf::from("tests/data.zip");
//! xtest_data::setup!().rewrite([&mut datazip]).build();
//!
//! // … and the crate works its magic to make this succeed.
//! assert!(datazip.exists(), "{}", datazip.display());
//! ```
//!
//! # For packagers
//!
//! The `.crate` file you have downloaded is a `.tar.gz` in disguise. When you unpack it for your
//! local build steps etc., verify that this package contains `Cargo.toml.orig` as well as a
//! `.cargo_vcs_info.json` file; and that the latter file has git commit information.
//!
//! You can then run the tests:
//!
//! ```bash
//! cargo test -- --nocapture
//! ```
//!
//! Don't worry, this won't access the network yet. In the first step it will only verify the
//! basic installation. It will then panic while printing information on what it _would have_ done
//! and instructions on how to proceed. You can opt into allowing network access by default with:
//!
//! ```bash
//! CARGO_XTEST_DATA_FETCH=yes cargo test -- --nocapture
//! ```
//!
//! ¹We need a place to store a shallow clone of the crate's source repository.
42#![forbid(unsafe_code)]
43mod git;
44
45use std::{borrow::Cow, env, ffi::OsString, fs, io, path::Path, path::PathBuf};
46use tinyjson::JsonValue;
47
/// A file or tree that was registered from [`Setup`].
///
/// This is a key into [`FsData`]. You can retrieve the local path using [`FsData::path()`]. The
/// returned path is either the local path on disk, when you are currently developing under a local
/// checkout of the version control system, or the path into which the file has been checked out.
#[derive(Debug)]
pub struct Files {
    /// Position of the registered path in the list kept by the `Setup` that issued this key.
    key: usize,
}
57
/// A resource whose path is owned by the `Setup` instead of being borrowed from the caller.
#[derive(Debug)]
enum Managed {
    // TODO: have a spec for the glob `<dir>/**.ext`?
    /// The path of a file or of a tree of files.
    Files(PathBuf),
}
63
/// A path borrowed from the caller that is rewritten in place.
type FsItem<'lt> = &'lt mut PathBuf;
65
/// The product of `Setup`, ensuring local file system accessible test resources.
///
/// This object is used to retrieve the local paths of resources that have been registered with the
/// method [`Setup::add()`].
#[derive(Debug)]
pub struct FsData {
    /// Maps every configured item, by its key, to its final path.
    ///
    /// This map will essentially be constant and we do not care about the VCS interpretation.
    map: Vec<PathBuf>,
}
76
77#[derive(Debug)]
78enum Source {
79    /// The data source is the crate's repository at a specific commit id.
80    VcsFromManifest {
81        /// TODO: we should support other commit identifiers.
82        commit_id: git::CommitId,
83        /// Evidence how we plan to access the source.
84        git: git::Git,
85        /// The directory where we may put git-dir and checkout of the resources.
86        datadir: PathBuf,
87    },
88    /// The data will be relative to the crate manifest.
89    Local(git::Git),
90}
91
92#[derive(Default, Debug)]
93struct Resources<'paths> {
94    /// All files and tree that are owned by the `Setup`.
95    /// Note: we never intend to remove anything from here. If we did we would have to do some kind
96    /// of remapping data structure to ensure that `Files` does not access the wrong item.
97    relative_files: Vec<Managed>,
98    /// Resources where we do 'simple' path replacement in a filter style.
99    ///
100    /// Note on ergonomics: We MAY take several different kinds of paths in the future to allow the
101    /// glob-style usage (`tests/samples/*.png`) to be efficiently executed. However, we should NOT
102    /// change the public API for this. We may well do some wrapping internally but the calls
103    /// should map to exactly one variant of any such item; and the enum variant should not be
104    /// directly exposed.
105    ///
106    /// This is based on the needs to perform more imports and additional calls to wrap locals in
107    /// those items. Basically, adding the crate should not be much more complex than making all
108    /// paths a variable and then throwing a `xtest_data::setup!()` on top.
109    unmanaged: Vec<FsItem<'paths>>,
110}
111
112/// A builder to configure desired test data paths.
113///
114/// This is created through [`setup!`] instead of a usual method as it must gather some information
115/// from the _callers_ environment first.
116///
117/// This is a builder and after configuration, its [`Setup::build()`] method should be called. Note
118/// the lifetime on this struct. This is either the lifetime of paths borrowed from the caller,
119/// which it will rewrite, or it can be `'static` when it owns all of the paths. The latter case
120/// requires them to be registered with [`Setup::add()`].
121///
122/// On a VCS copy of the surrounding package this will simply collect and validate the information,
123/// canonicalizing paths to be interpreted from the Manifest in the process.
124///
125/// However, when executed in the source tree from `.crate` then it will rewrite them all to refer
126/// to a local copy of the data instead. That is, if it is allowed to, since by default we merely
127/// provide a detailed report of data paths, repository location, and commit information that would
128/// _need_ to be fetched before aborting. When the environment has opted into our access of network
129/// (and might have overridden the repository path) then we will perform the actual access,
130/// checkout, and rewrite.
131#[must_use = "This is only a builder. Call `build` to perform validation/fetch/etc."]
132#[derive(Debug)]
133pub struct Setup<'paths> {
134    repository: OsString,
135    manifest: &'static str,
136    /// Have we determined to be local or in a crate?.
137    source: Source,
138    /// The resources that we store.
139    resources: Resources<'paths>,
140}
141
/// The options determined from the compile time environment of the crate that called us.
///
/// This is all the environment data we gather from the `setup` macro, which allows us to get the
/// environment flags passed to the _calling_ crate instead of our own. Please do not construct
/// this directly since doing so could affect the integrity of the information.
///
/// This is independent from the data gathered from the _runtime_ environment. The two are combined
/// when the macro hands this value over to the crate.
#[doc(hidden)]
#[derive(Debug)]
pub struct EnvOptions {
    /// Value of `CARGO_PKG_REPOSITORY` while compiling the calling crate.
    pub pkg_repository: &'static str,
    /// Value of `CARGO_MANIFEST_DIR` while compiling the calling crate.
    pub manifest_dir: &'static str,
    /// Value of `CARGO_TARGET_TMPDIR` while compiling the calling crate, if it was set.
    pub target_tmpdir: Option<&'static str>,
}
156
/// Create a builder to configure local test data.
///
/// This evaluates to an instance of [`Setup`].
///
/// This can be run in _integration tests_ (and in integration tests only) to ensure that those can
/// be replicated from a source distribution of the package, while actually using additional data
/// stored in your repository. The commit ID of the head, stored inside the package, is used for
/// bit-by-bit reproducibility of the test data.
///
/// You can rely on this package only using data within the git tree associated with the commit ID
/// stored in the package. As a tester downstream, if the maintainer of the package signs their
/// crates, and you validate that signature, then by extension and Git's content addressability all
/// data is ensured to have been signed-off by the maintainer.
///
/// When developing locally this checks the plausibility of cargo data and then tries to determine
/// if `git` is in use (other VCS are welcome but need to be supported by cargo first).
///
/// ## Panics
///
/// This function _panics_ if any of the following is true:
/// * The function is called outside of an integration test.
/// * There is no VCS in use.
/// * We could not determine how to use the VCS of the repository.
/// * The repository URL as configured in `Cargo.toml` is not valid.
/// * We could not create a bare repository in the directory `${CARGO_TARGET_TMPDIR}`.
///
/// When executing from the distribution form of a package, we will also panic if any of the
/// following are true:
/// * The commit ID that is being read from `.cargo_vcs_info.json` can not be fetched from the
///   remote repository.
/// * There is no `.cargo_vcs_info.json` and the manifest is _not_ in a VCS folder.
///
/// Note that the eventual call to `build()` has some additional panics.
#[macro_export]
macro_rules! setup {
    () => {
        $crate::_setup($crate::EnvOptions {
            // FIXME: technically this isn't critical information.
            // We could rely on the user passing one to us since we will fail when that is not a
            // git repository with the correct commit ID. That's just their fault.
            pkg_repository: env!("CARGO_PKG_REPOSITORY"),
            manifest_dir: env!("CARGO_MANIFEST_DIR"),
            target_tmpdir: option_env!("CARGO_TARGET_TMPDIR"),
        })
    };
}
203
204#[doc(hidden)]
205pub fn _setup(options: EnvOptions) -> Setup<'static> {
206    let EnvOptions {
207        pkg_repository: repository,
208        manifest_dir: manifest,
209        target_tmpdir: tmpdir,
210    } = options;
211    if repository.is_empty() {
212        inconclusive(&mut "The crate must have a valid URL in `package.repository`");
213    }
214
215    // Now allow the override.
216    let repository = env::var_os("CARGO_XTEST_DATA_REPOSITORY_ORIGIN")
217        .unwrap_or_else(|| OsString::from(repository));
218
219    // Make sure this is an integration test, or at least we have the dir.
220    // We don't want to block building over this (e.g. the crate itself here) but we _do_ want to
221    // restrict running this `setup` function
222    let integration_test_tempdir = tmpdir.map(Path::new);
223
224    let vcs_info_path = Path::new(manifest).join(".cargo_vcs_info.json");
225
226    let source = if vcs_info_path.exists() {
227        // Allow the override.
228        trait GetKey {
229            fn get_key(&self, key: &str) -> Option<&Self>;
230        }
231        impl GetKey for JsonValue {
232            fn get_key(&self, key: &str) -> Option<&Self> {
233                self.get::<std::collections::HashMap<_, _>>()?.get(key)
234            }
235        }
236
237        let data =
238            fs::read_to_string(vcs_info_path).unwrap_or_else(|mut err| inconclusive(&mut err));
239        let vcs: JsonValue = data
240            .parse()
241            .unwrap_or_else(|mut err| inconclusive(&mut err));
242        let commit_id = vcs
243            .get_key("git")
244            .unwrap_or_else(|| inconclusive(&mut "VCS does not contain a git section."))
245            .get_key("sha1")
246            .unwrap_or_else(|| inconclusive(&mut "VCS commit ID not recognized."))
247            .get::<String>()
248            .map(|id| git::CommitId::from(&**id))
249            .unwrap_or_else(|| inconclusive(&mut "VCS commit ID is not a string"));
250
251        // Okay, that makes sense. We know _what_ to access.
252        // Now let's also try to find out how we will access it. Let's find `git`.
253        // To shell out to because we are lazy.
254        let git = git::Git::new().unwrap_or_else(|mut err| inconclusive(&mut err));
255
256        let datadir = integration_test_tempdir
257            .map(Cow::Borrowed)
258            .or_else(|| {
259                    let environment_temp = std::env::var_os("CARGO_XTEST_DATA_TMPDIR")
260                        .or_else(|| std::env::var_os("TMPDIR"))
261                        .map(PathBuf::from)?;
262                    // TODO: nah, in this case we should have some distinguisher for the exact crate
263                    // name and version in the tmpdir. At least that would catch the gravest of errors
264                    // when testing many crates at the same time. (Although sharing the git dir would
265                    // be an advantage).
266                    Some(Cow::Owned(environment_temp))
267                })
268            .expect("This setup must only be called in an integration test or benchmark, or with an explicit TMPDIR")
269            .into_owned();
270
271        Source::VcsFromManifest {
272            commit_id,
273            git,
274            datadir,
275        }
276    } else {
277        // Check that we can recognize tracked files.
278        let git = git::Git::new().unwrap_or_else(|mut err| inconclusive(&mut err));
279        Source::Local(git)
280    };
281
282    // And finally this must be valid.
283    if repository.is_empty() {
284        inconclusive(&mut "The repository must have a valid URL");
285    }
286
287    Setup {
288        repository,
289        manifest,
290        source,
291        resources: Resources::default(),
292    }
293}
294
295impl<'lt> Setup<'lt> {
296    /// Register some paths to rewrite their location.
297    ///
298    /// The paths should be relative to the crate's manifest. For example, to refer to data in your
299    /// `tests` directory you would use `PathBuf::from("tests/data.zip")`.
300    ///
301    /// The paths will be registered internally. If the repository is local they will be rewritten
302    /// to be relative to the manifest location. If the repository is a crate distribution then the
303    /// paths will be sparsely checked out (meaning: only that path will be downloaded from the VCS
304    /// working dir and you can't expect any other files to be present).
305    ///
306    /// Those actions will happen when you call [`Setup::build()`].
307    ///
308    /// # Example
309    ///
310    /// ```
311    /// use std::path::PathBuf;
312    /// use xtest_data::setup;
313    ///
314    /// let mut path = PathBuf::from("tests/data.zip");
315    /// setup!().rewrite([&mut path]).build();
316    ///
317    /// assert!(path.exists(), "{}", path.display());
318    /// ```
319    pub fn rewrite(mut self, iter: impl IntoIterator<Item = &'lt mut PathBuf>) -> Self {
320        self.resources.unmanaged.extend(iter);
321        self
322    }
323
324    /// Register the path of a file or a tree of files.
325    ///
326    /// The return value is a key that can later be used in [`FsData`]. All the files under this
327    /// location will be checked out when `Setup::build()` is called in a crate-build.
328    ///
329    /// # Example
330    ///
331    /// ```
332    /// let mut vcs = xtest_data::setup!();
333    /// let datazip = vcs.add("tests/data.zip");
334    /// let testdata = vcs.build();
335    ///
336    /// let path = testdata.path(&datazip);
337    /// assert!(path.exists(), "{}", path.display());
338    /// ```
339
340    pub fn add(&mut self, path: impl AsRef<Path>) -> Files {
341        fn path_impl(resources: &mut Resources, path: &Path) -> usize {
342            let item = Managed::Files(path.to_owned());
343            let key = resources.relative_files.len();
344            resources.relative_files.push(item);
345            key
346        }
347
348        let key = path_impl(&mut self.resources, path.as_ref());
349        Files { key }
350    }
351
352    /// Run the final validation and perform rewrites.
353    ///
354    /// Returns the frozen dictionary of file mappings that had been registered with
355    /// [`Setup::add()`]. This allows retrieving the final data paths for those items.
356    ///
357    /// ## Panics
358    ///
359    /// This will panic if:
360    /// * Any registered file or tree is not tracked in the VCS.
361    /// * You have not allowed retrieving data from the VCS.
362    /// * It was not possible to retrieve the data from the VCS.
363    pub fn build(self) -> FsData {
364        let mut map;
365        match self.source {
366            Source::Local(git) => {
367                let dir = git::CrateDir::new(self.manifest, &git);
368                let datapath = Path::new(self.manifest);
369                dir.tracked(&git, &mut self.resources.path_specs());
370                map = vec![];
371                self.resources.relative_files.iter().for_each(|path| {
372                    map.push(datapath.join(path.as_path()));
373                });
374                self.resources
375                    .unmanaged
376                    .into_iter()
377                    .for_each(|item| set_root(datapath, item));
378            }
379            Source::VcsFromManifest {
380                commit_id,
381                datadir,
382                git,
383            } => {
384                let origin = git::Origin {
385                    url: self.repository,
386                };
387
388                let gitpath = datadir.join("xtest-data-git");
389                let datapath = unique_dir(&datadir, "xtest-data-tree")
390                    .unwrap_or_else(|mut err| inconclusive(&mut err));
391
392                git.consent_to_use(
393                    &gitpath,
394                    &datapath,
395                    &origin,
396                    &commit_id,
397                    &mut self.resources.as_paths(),
398                    &mut self.resources.path_specs(),
399                );
400
401                let shallow = git.shallow_clone(gitpath, origin);
402
403                shallow.fetch(&git, &commit_id);
404                shallow.checkout(
405                    &git,
406                    &datapath,
407                    &commit_id,
408                    &mut self.resources.path_specs(),
409                );
410                map = vec![];
411                self.resources.relative_files.iter().for_each(|path| {
412                    map.push(datapath.join(path.as_path()));
413                });
414                self.resources
415                    .unmanaged
416                    .into_iter()
417                    .for_each(|item| set_root(&datapath, item));
418            }
419        }
420
421        // In the end we just discard some information.
422        // We don't really need it anymore after the checks.
423        //
424        // TODO: of course we could avoid actually checking files onto the disk if we had some kind
425        // of `io::Read` abstraction that read them straight from `git cat` instead. But chances
426        // are you'll like your files and directory structures.
427        FsData { map }
428    }
429}
430
431impl Resources<'_> {
432    pub fn as_paths(&self) -> impl Iterator<Item = &'_ Path> {
433        let values = self.relative_files.iter().map(Managed::as_path);
434        let unmanaged = self.unmanaged.iter().map(|x| Path::new(x));
435        values.chain(unmanaged)
436    }
437
438    pub fn path_specs(&self) -> impl Iterator<Item = git::PathSpec<'_>> {
439        let values = self.relative_files.iter().map(Managed::as_path_spec);
440        let unmanaged = self.unmanaged.iter().map(|x| git::PathSpec::Path(&**x));
441        values.chain(unmanaged)
442    }
443}
444
445impl FsData {
446    /// Retrieve the rewritten path of a file or tree of files.
447    pub fn path(&self, file: &Files) -> &Path {
448        self.map.get(file.key).unwrap().as_path()
449    }
450}
451
452impl Managed {
453    pub fn as_path(&self) -> &Path {
454        match self {
455            Managed::Files(path) => path,
456        }
457    }
458
459    fn as_path_spec(&self) -> git::PathSpec<'_> {
460        match self {
461            Managed::Files(path) => git::PathSpec::Path(path),
462        }
463    }
464}
465
/// Re-root `dir` below `path`, in place.
///
/// This follows [`Path::join`]: a relative `dir` is appended to `path`, while an absolute `dir`
/// replaces it and is therefore left unchanged.
fn set_root(path: &Path, dir: &mut PathBuf) {
    let rooted = path.join(dir.as_path());
    *dir = rooted;
}
469
470// We do not use tempdir. This should already be done by our environment (e.g. cargo).
471fn unique_dir(base: &Path, prefix: &str) -> Result<PathBuf, std::io::Error> {
472    let mut rng = nanorand::tls::tls_rng();
473    assert!(matches!(
474        Path::new(prefix).components().next(),
475        Some(std::path::Component::Normal(_))
476    ));
477    assert!(Path::new(prefix).components().nth(1).is_none());
478
479    let mut buffer = prefix.to_string();
480    let mut generate_name = move || -> PathBuf {
481        use nanorand::Rng;
482        const TABLE: &str = "0123456789abcdef";
483        let num: [u8; 8] = rng.rand();
484
485        buffer.clear();
486        buffer.push_str(prefix);
487
488        for byte in num {
489            let (low, hi) = (usize::from(byte & 0xf), usize::from((byte >> 4) & 0xf));
490            buffer.push_str(&TABLE[low..low + 1]);
491            buffer.push_str(&TABLE[hi..hi + 1]);
492        }
493
494        base.join(&buffer)
495    };
496
497    loop {
498        let path = generate_name();
499        match fs::create_dir(&path) {
500            Ok(_) => return Ok(path),
501            Err(err) if err.kind() == io::ErrorKind::AlreadyExists => {}
502            Err(other) => return Err(other),
503        }
504    }
505}
506
/// Abort the setup: report `err` on stderr and panic.
///
/// The reason is also carried in the panic message, so it remains visible where only the panic
/// itself is reported.
#[cold]
fn inconclusive(err: &mut dyn std::fmt::Display) -> ! {
    eprintln!("xtest-data failed to setup.");
    eprintln!("Information: {}", err);
    panic!("xtest-data failed to setup: {}", err);
}