xtest_data/
lib.rs

1//! Fetch test data in packaged crate tests.
2//!
3//! # For crate authors
4//!
5//! Drop these lines into your _integration tests_ (due to a limitation in `cargo` this will only
6//! work in integration tests right now¹). Note that this requires your repository—through the URL
7//! contained in `Cargo.toml`—to be readable by the environment where you wish to test the packaged
8//! crate.
9//!
10//! ```rust
11//! use std::path::PathBuf;
12//!
13//! // or any other file you want to use.
14//! let mut datazip = PathBuf::from("tests/data.zip");
15//! xtest_data::setup!().rewrite([&mut datazip]).build();
16//!
17//! // … and the crate works its magic to make this succeed.
18//! assert!(datazip.exists(), "{}", datazip.display());
19//! ```
20//!
21//! ¹The crate uses a directory to store a shallow clone of the source repository. Only integration
22//! tests have the environment variable that cargo uses to communicate a temporary directory within
23//! its `target` folder. That temporary directory is required for our choice.
24//!
25//! Configure meta data in your `Cargo.toml` according to the guide to inform packagers where they
26//! will find the test data. See the [Readme](crate::readme) in the sources for an example. When
27//! you're ready to deploy, let the xtest-data binary pack all necessary data files from your
28//! source tree:
29//!
30//! ```bash
31//! cargo xtest-data pack
32//! ```
33//!
34//! Make sure to upload the archive according to your published configuration!
35//!
36//! # For packagers
37//!
38//! ```bash
39//! cargo install xtest-data --features=bin-xtask
40//! ```
41//!
42//! Any `.crate` file you have downloaded is a `.tar.gz` in disguise. When you unpack it for your
43//! local build steps etc., you may verify that this package contains `Cargo.toml.orig` as well as
44//! a `.cargo_vcs_info.json` file; and that the latter file has git commit information.
45//!
46//! The binary orchestrates fetching a shallow pack of the requested data sources from the upstream
47//! repository, or consuming one from your local filesystem if you would rather do the networking
48//! yourself. The basic structure you want is:
49//!
50//! ```bash
51//! cargo xtest-data test-crate /path/to/your.crate [--pack-artifact /path/to/pack-artifact]
52//! ```
53#![forbid(unsafe_code)]
54mod git;
55
56use std::{borrow::Cow, env, ffi::OsString, fs, io, path::Path, path::PathBuf};
57use tinyjson::JsonValue;
58
#[cfg(doc)]
/// The project Readme with further documentation; this module only exists in documentation builds.
#[doc = include_str!("../Readme.md")]
pub mod readme {}
63
/// Handle for a file or tree that was registered through [`Setup::add()`].
///
/// The handle is a key into [`FsData`]: pass it to [`FsData::path()`] to obtain the location of
/// the resource after [`Setup::build()`] has run. That location is rooted either at the crate
/// manifest directory (local checkout) or at the directory into which the data was checked out.
#[derive(Debug)]
pub struct Files {
    /// Position of the registered path, in registration order.
    key: usize,
}
73
/// A resource owned by the `Setup`, as opposed to a caller-owned path that is rewritten in place.
#[derive(Debug)]
enum Managed {
    /// A path to a file or tree, relative to the crate manifest.
    // TODO: have a spec for the glob `<dir>/**.ext`?
    Files(PathBuf),
}
79
/// A caller-owned path that is borrowed mutably so that it can be rewritten in place.
type FsItem<'path> = &'path mut PathBuf;
81
/// The result of [`Setup::build()`]: test resources that are accessible on the local file system.
///
/// Use this object to look up the final paths of resources that were registered with
/// [`Setup::add()`].
#[derive(Debug)]
pub struct FsData {
    /// The final path of every registered item, in registration order.
    /// This is effectively constant after construction; no VCS information is retained.
    map: Vec<PathBuf>,
}
92
93#[derive(Debug)]
94enum Source {
95    /// The data source is the crate's repository at a specific commit id.
96    VcsFromManifest {
97        /// TODO: we should support other commit identifiers.
98        commit_id: git::CommitId,
99        /// Evidence how we plan to access the source.
100        git: git::Git,
101        /// The directory where we may put git-dir and checkout of the resources.
102        datadir: PathBuf,
103    },
104    /// The data will be relative to the crate manifest.
105    Local(git::Git),
106}
107
108#[derive(Default, Debug)]
109struct Resources<'paths> {
110    /// All files and tree that are owned by the `Setup`.
111    /// Note: we never intend to remove anything from here. If we did we would have to do some kind
112    /// of remapping data structure to ensure that `Files` does not access the wrong item.
113    relative_files: Vec<Managed>,
114    /// Resources where we do 'simple' path replacement in a filter style.
115    ///
116    /// Note on ergonomics: We MAY take several different kinds of paths in the future to allow the
117    /// glob-style usage (`tests/samples/*.png`) to be efficiently executed. However, we should NOT
118    /// change the public API for this. We may well do some wrapping internally but the calls
119    /// should map to exactly one variant of any such item; and the enum variant should not be
120    /// directly exposed.
121    ///
122    /// This is based on the needs to perform more imports and additional calls to wrap locals in
123    /// those items. Basically, adding the crate should not be much more complex than making all
124    /// paths a variable and then throwing a `xtest_data::setup!()` on top.
125    unmanaged: Vec<FsItem<'paths>>,
126}
127
128/// A builder to configure desired test data paths.
129///
130/// This is created through [`setup!`] instead of a usual method as it must gather some information
131/// from the _callers_ environment first.
132///
133/// This is a builder and after configuration, its [`Setup::build()`] method should be called. Note
134/// the lifetime on this struct. This is either the lifetime of paths borrowed from the caller,
135/// which it will rewrite, or it can be `'static` when it owns all of the paths. The latter case
136/// requires them to be registered with [`Setup::add()`].
137///
138/// On a VCS copy of the surrounding package this will simply collect and validate the information,
139/// canonicalizing paths to be interpreted from the Manifest in the process.
140///
141/// However, when executed in the source tree from `.crate` then it will rewrite them all to refer
142/// to a local copy of the data instead. That is, if it is allowed to, since by default we merely
143/// provide a detailed report of data paths, repository location, and commit information that would
144/// _need_ to be fetched before aborting. When the environment has opted into our access of network
145/// (and might have overridden the repository path) then we will perform the actual access,
146/// checkout, and rewrite.
147#[must_use = "This is only a builder. Call `build` to perform validation/fetch/etc."]
148#[derive(Debug)]
149pub struct Setup<'paths> {
150    repository: OsString,
151    manifest: &'static str,
152    /// Have we determined to be local or in a crate?.
153    source: Source,
154    /// The resources that we store.
155    resources: Resources<'paths>,
156    /// A git pack archive with files.
157    pack_objects: Option<OsString>,
158}
159
/// The options determined from the compile-time environment of the crate that called us.
///
/// This is all the environment data gathered by the `setup` macro. Collecting it in the macro
/// yields the values passed to the _calling_ crate instead of our own. Please do not construct
/// this directly since doing so could affect the integrity of the information.
///
/// This is independent of the data gathered from the _runtime_ environment; the two are combined
/// when the [`Setup`] is created and built.
#[doc(hidden)]
pub struct EnvOptions {
    /// Value of `CARGO_PKG_REPOSITORY` for the calling crate.
    pub pkg_repository: &'static str,
    /// Value of `CARGO_MANIFEST_DIR` for the calling crate.
    pub manifest_dir: &'static str,
    /// Value of `CARGO_TARGET_TMPDIR` for the calling crate, when it was set during compilation.
    pub target_tmpdir: Option<&'static str>,
}
174
/// Create a builder to configure local test data.
///
/// This evaluates to an instance of [`Setup`].
///
/// It is meant to be run in _integration tests_, so that those tests can be replicated from a
/// source distribution of the package while still using additional data stored in your
/// repository. The commit ID of the head, stored inside the package, is used for bit-by-bit
/// reproducibility of the test data.
///
/// You can rely on this package only using data within the git tree associated with the commit ID
/// stored in the package. As a tester downstream, if the maintainer of the package signs their
/// crates, and you validate that signature, then by extension and Git's content addressability all
/// data is ensured to have been signed-off by the maintainer.
///
/// The macro captures `CARGO_PKG_REPOSITORY`, `CARGO_MANIFEST_DIR` and, when available,
/// `CARGO_TARGET_TMPDIR` from the compile-time environment of the calling crate.
///
/// ## Panics
///
/// The expansion _panics_ if any of the following is true:
/// * `package.repository` in `Cargo.toml` is empty.
/// * The `git` tooling could not be set up.
/// * `CARGO_XTEST_VCS_INFO` is set but no file exists at that path.
///
/// When a VCS info file (`.cargo_vcs_info.json` next to the manifest, or the override above) is
/// present, it also panics if any of the following is true:
/// * The file can not be read or parsed, or it has no string at `git.sha1`.
/// * No temporary directory is known: `CARGO_TARGET_TMPDIR` was not set at compile time (it is
///   only provided to integration tests and benchmarks) and neither `CARGO_XTEST_DATA_TMPDIR` nor
///   `TMPDIR` is set at runtime.
///
/// Note that the eventual call to `build()` has some additional panics.
#[macro_export]
macro_rules! setup {
    () => {
        $crate::_setup($crate::EnvOptions {
            manifest_dir: env!("CARGO_MANIFEST_DIR"),
            // FIXME: technically the repository isn't critical information.
            // We could rely on the user passing one to us since we will fail when that is not a
            // git repository with the correct commit ID. That's just their fault.
            pkg_repository: env!("CARGO_PKG_REPOSITORY"),
            target_tmpdir: option_env!("CARGO_TARGET_TMPDIR"),
        })
    };
}
221
222#[doc(hidden)]
223pub fn _setup(options: EnvOptions) -> Setup<'static> {
224    let EnvOptions {
225        pkg_repository: repository,
226        manifest_dir: manifest,
227        target_tmpdir: tmpdir,
228    } = options;
229    if repository.is_empty() {
230        inconclusive(&mut "The crate must have a valid URL in `package.repository`");
231    }
232
233    // Now allow the override.
234    let repository = OsString::from(repository);
235
236    // Make sure this is an integration test, or at least we have the dir.
237    // We don't want to block building over this (e.g. the crate itself here) but we _do_ want to
238    // restrict running this `setup` function
239    let integration_test_tempdir = tmpdir.map(Path::new);
240
241    let vcs_info_path = env::var_os("CARGO_XTEST_VCS_INFO");
242    let force_vcs = vcs_info_path.is_some();
243
244    let vcs_info_path = vcs_info_path.as_ref().map_or_else(
245        || Path::new(manifest).join(".cargo_vcs_info.json"),
246        PathBuf::from,
247    );
248
249    let (source, pack_objects);
250    if vcs_info_path.exists() {
251        // Allow the override.
252        trait GetKey {
253            fn get_key(&self, key: &str) -> Option<&Self>;
254        }
255        impl GetKey for JsonValue {
256            fn get_key(&self, key: &str) -> Option<&Self> {
257                self.get::<std::collections::HashMap<_, _>>()?.get(key)
258            }
259        }
260
261        let data =
262            fs::read_to_string(vcs_info_path).unwrap_or_else(|mut err| inconclusive(&mut err));
263        let vcs: JsonValue = data
264            .parse()
265            .unwrap_or_else(|mut err| inconclusive(&mut err));
266        let commit_id = vcs
267            .get_key("git")
268            .unwrap_or_else(|| inconclusive(&mut "VCS does not contain a git section."))
269            .get_key("sha1")
270            .unwrap_or_else(|| inconclusive(&mut "VCS commit ID not recognized."))
271            .get::<String>()
272            .map(|id| git::CommitId::from(&**id))
273            .unwrap_or_else(|| inconclusive(&mut "VCS commit ID is not a string"));
274
275        // Okay, that makes sense. We know _what_ to access.
276        // Now let's also try to find out how we will access it. Let's find `git`.
277        // To shell out to because we are lazy.
278        let git = git::Git::new().unwrap_or_else(|mut err| inconclusive(&mut err));
279
280        let datadir = integration_test_tempdir
281            .map(Cow::Borrowed)
282            .or_else(|| {
283                    let environment_temp = std::env::var_os("CARGO_XTEST_DATA_TMPDIR")
284                        .or_else(|| std::env::var_os("TMPDIR"))
285                        .map(PathBuf::from)?;
286                    // TODO: nah, in this case we should have some distinguisher for the exact crate
287                    // name and version in the tmpdir. At least that would catch the gravest of errors
288                    // when testing many crates at the same time. (Although sharing the git dir would
289                    // be an advantage).
290                    Some(Cow::Owned(environment_temp))
291                })
292            .expect("This setup must only be called in an integration test or benchmark, or with an explicit TMPDIR")
293            .into_owned();
294
295        pack_objects = std::env::var_os("CARGO_XTEST_DATA_PACK_OBJECTS");
296        source = Source::VcsFromManifest {
297            commit_id,
298            git,
299            datadir,
300        };
301    } else if force_vcs {
302        inconclusive(&mut format!(
303            "Expected VCS info at {}",
304            vcs_info_path.display()
305        ));
306    } else {
307        // Check that we can recognize tracked files.
308        let git = git::Git::new().unwrap_or_else(|mut err| inconclusive(&mut err));
309        source = Source::Local(git);
310        pack_objects = std::env::var_os("CARGO_XTEST_DATA_PACK_OBJECTS");
311    };
312
313    // And finally this must be valid.
314    if repository.is_empty() {
315        inconclusive(&mut "The repository must have a valid URL");
316    }
317
318    Setup {
319        repository,
320        manifest,
321        source,
322        resources: Resources::default(),
323        pack_objects,
324    }
325}
326
327impl<'lt> Setup<'lt> {
328    /// Register some paths to rewrite their location.
329    ///
330    /// The paths should be relative to the crate's manifest. For example, to refer to data in your
331    /// `tests` directory you would use `PathBuf::from("tests/data.zip")`.
332    ///
333    /// The paths will be registered internally. If the repository is local they will be rewritten
334    /// to be relative to the manifest location. If the repository is a crate distribution then the
335    /// paths will be sparsely checked out (meaning: only that path will be downloaded from the VCS
336    /// working dir and you can't expect any other files to be present).
337    ///
338    /// Those actions will happen when you call [`Setup::build()`].
339    ///
340    /// # Example
341    ///
342    /// ```
343    /// use std::path::PathBuf;
344    /// use xtest_data::setup;
345    ///
346    /// let mut path = PathBuf::from("tests/data.zip");
347    /// setup!().rewrite([&mut path]).build();
348    ///
349    /// assert!(path.exists(), "{}", path.display());
350    /// ```
351    pub fn rewrite(mut self, iter: impl IntoIterator<Item = &'lt mut PathBuf>) -> Self {
352        self.resources.unmanaged.extend(iter);
353        self
354    }
355
356    /// Register the path of a file or a tree of files.
357    ///
358    /// The return value is a key that can later be used in [`FsData`]. All the files under this
359    /// location will be checked out when `Setup::build()` is called in a crate-build.
360    ///
361    /// # Example
362    ///
363    /// ```
364    /// let mut vcs = xtest_data::setup!();
365    /// let datazip = vcs.add("tests/data.zip");
366    /// let testdata = vcs.build();
367    ///
368    /// let path = testdata.path(&datazip);
369    /// assert!(path.exists(), "{}", path.display());
370    /// ```
371
372    pub fn add(&mut self, path: impl AsRef<Path>) -> Files {
373        fn path_impl(resources: &mut Resources, path: &Path) -> usize {
374            let item = Managed::Files(path.to_owned());
375            let key = resources.relative_files.len();
376            resources.relative_files.push(item);
377            key
378        }
379
380        let key = path_impl(&mut self.resources, path.as_ref());
381        Files { key }
382    }
383
384    /// Run the final validation and perform rewrites.
385    ///
386    /// Returns the frozen dictionary of file mappings that had been registered with
387    /// [`Setup::add()`]. This allows retrieving the final data paths for those items.
388    ///
389    /// ## Panics
390    ///
391    /// This will panic if:
392    /// * Any registered file or tree is not tracked in the VCS.
393    /// * You have not allowed retrieving data from the VCS.
394    /// * It was not possible to retrieve the data from the VCS.
395    pub fn build(self) -> FsData {
396        let mut map;
397        match self.source {
398            Source::Local(git) => {
399                let dir = git::CrateDir::new(self.manifest, &git);
400                let datapath = Path::new(self.manifest);
401                dir.tracked(&git, &mut self.resources.path_specs());
402
403                if let Some(pack_objects) = self.pack_objects {
404                    std::fs::create_dir_all(&pack_objects)
405                        .unwrap_or_else(|mut err| inconclusive(&mut err));
406                    dir.pack_objects(&git, &mut self.resources.path_specs(), pack_objects);
407                }
408
409                map = vec![];
410                self.resources.relative_files.iter().for_each(|path| {
411                    map.push(datapath.join(path.as_path()));
412                });
413
414                self.resources
415                    .unmanaged
416                    .into_iter()
417                    .for_each(|item| set_root(datapath, item));
418            }
419            Source::VcsFromManifest {
420                commit_id,
421                datadir,
422                git,
423            } => {
424                let origin = git::Origin {
425                    url: self.repository,
426                };
427
428                let gitpath = datadir.join("xtest-data-git");
429                let datapath = unique_dir(&datadir, "xtest-data-tree")
430                    .unwrap_or_else(|mut err| inconclusive(&mut err));
431
432                let shallow;
433                if let Some(pack_objects) = self.pack_objects {
434                    shallow = git.bare(gitpath, &commit_id);
435                    shallow.unpack(&git, &pack_objects);
436                } else {
437                    panic!("Requested test data from {} but have no packed artifacts to load. Provide an explicit path to a directory to unpack via the `CARGO_XTEST_DATA_PACK_OBJECTS` environment variable", Path::new(&origin.url).display());
438                }
439
440                shallow.checkout(
441                    &git,
442                    &datapath,
443                    &commit_id,
444                    &mut self.resources.path_specs(),
445                );
446                map = vec![];
447                self.resources.relative_files.iter().for_each(|path| {
448                    map.push(datapath.join(path.as_path()));
449                });
450                self.resources
451                    .unmanaged
452                    .into_iter()
453                    .for_each(|item| set_root(&datapath, item));
454            }
455        }
456
457        // In the end we just discard some information.
458        // We don't really need it anymore after the checks.
459        //
460        // TODO: of course we could avoid actually checking files onto the disk if we had some kind
461        // of `io::Read` abstraction that read them straight from `git cat` instead. But chances
462        // are you'll like your files and directory structures.
463        FsData { map }
464    }
465}
466
467impl Resources<'_> {
468    pub fn path_specs(&self) -> impl Iterator<Item = git::PathSpec<'_>> {
469        let values = self.relative_files.iter().map(Managed::as_path_spec);
470        let unmanaged = self.unmanaged.iter().map(|x| git::PathSpec::Path(&**x));
471        values.chain(unmanaged)
472    }
473}
474
475impl FsData {
476    /// Retrieve the rewritten path of a file or tree of files.
477    pub fn path(&self, file: &Files) -> &Path {
478        self.map.get(file.key).unwrap().as_path()
479    }
480}
481
482impl Managed {
483    pub fn as_path(&self) -> &Path {
484        match self {
485            Managed::Files(path) => path,
486        }
487    }
488
489    fn as_path_spec(&self) -> git::PathSpec<'_> {
490        match self {
491            Managed::Files(path) => git::PathSpec::Path(path),
492        }
493    }
494}
495
/// Replace `dir` with the result of joining it onto `path`.
fn set_root(path: &Path, dir: &mut PathBuf) {
    let rooted = path.join(dir.as_path());
    *dir = rooted;
}
499
500// We do not use tempdir. This should already be done by our environment (e.g. cargo).
501fn unique_dir(base: &Path, prefix: &str) -> Result<PathBuf, std::io::Error> {
502    let mut rng = nanorand::tls::tls_rng();
503    assert!(matches!(
504        Path::new(prefix).components().next(),
505        Some(std::path::Component::Normal(_))
506    ));
507    assert!(Path::new(prefix).components().nth(1).is_none());
508
509    let mut buffer = prefix.to_string();
510    let mut generate_name = move || -> PathBuf {
511        use nanorand::Rng;
512        const TABLE: &str = "0123456789abcdef";
513        let num: [u8; 8] = rng.rand();
514
515        buffer.clear();
516        buffer.push_str(prefix);
517
518        for byte in num {
519            let (low, hi) = (usize::from(byte & 0xf), usize::from((byte >> 4) & 0xf));
520            buffer.push_str(&TABLE[low..low + 1]);
521            buffer.push_str(&TABLE[hi..hi + 1]);
522        }
523
524        base.join(&buffer)
525    };
526
527    loop {
528        let path = generate_name();
529        match fs::create_dir(&path) {
530            Ok(_) => return Ok(path),
531            Err(err) if err.kind() == io::ErrorKind::AlreadyExists => {}
532            Err(other) => return Err(other),
533        }
534    }
535}
536
/// Abort the setup: print a fixed header and `err` to standard error, then panic.
///
/// This never returns, which allows it to be used in `unwrap_or_else` closures of any type.
#[cold]
#[track_caller]
fn inconclusive(err: &mut dyn std::fmt::Display) -> ! {
    let details: &dyn std::fmt::Display = err;
    eprintln!("xtest-data failed to setup.");
    eprintln!("Information: {}", details);
    panic!()
}