xtest_data/lib.rs
1//! Fetch test data in packaged crate tests.
2//!
3//! # For crate authors
4//!
5//! Drop these lines into your _integration tests_ (due to a limitation in `cargo` this will only
6//! work in integration tests right now¹). Note that this requires your repository—through the URL
7//! contained in `Cargo.toml`—to be readable by the environment where you wish to test the packaged
8//! crate.
9//!
10//! ```rust
11//! use std::path::PathBuf;
12//!
13//! // or any other file you want to use.
14//! let mut datazip = PathBuf::from("tests/data.zip");
15//! xtest_data::setup!().rewrite([&mut datazip]).build();
16//!
17//! // … and the crate works its magic to make this succeed.
18//! assert!(datazip.exists(), "{}", datazip.display());
19//! ```
20//!
21//! ¹The crate uses a directory to store a shallow clone of the source repository. Only integration
22//! tests have the environment variable that cargo uses to communicate a temporary directory within
23//! its `target` folder. That temporary directory is required for our choice.
24//!
25//! Configure meta data in your `Cargo.toml` according to the guide to inform packagers where they
26//! will find the test data. See the [Readme](crate::readme) in the sources for an example. When
27//! you're ready to deploy, let the xtest-data binary pack all necessary data files from your
28//! source tree:
29//!
30//! ```bash
31//! cargo xtest-data pack
32//! ```
33//!
34//! Make sure to upload the archive according to your published configuration!
35//!
36//! # For packagers
37//!
38//! ```bash
39//! cargo install xtest-data --features=bin-xtask
40//! ```
41//!
42//! Any `.crate` file you have downloaded is a `.tar.gz` in disguise. When you unpack it for your
43//! local build steps etc., you may verify that this package contains `Cargo.toml.orig` as well as
44//! a `.cargo_vcs_info.json` file; and that the latter file has git commit information.
45//!
//! The binary orchestrates fetching a shallow pack of the requested data sources from the upstream
//! repository, or consuming one from your local filesystem if you would rather do the networking
//! yourself. The basic invocation is:
49//!
50//! ```bash
51//! cargo xtest-data test-crate /path/to/your.crate [--pack-artifact /path/to/pack-artifact]
52//! ```
53#![forbid(unsafe_code)]
54mod git;
55
56use std::{borrow::Cow, env, ffi::OsString, fs, io, path::Path, path::PathBuf};
57use tinyjson::JsonValue;
58
#[cfg(doc)]
/// The Readme and further documentation; this module only exists in documentation builds.
#[doc = include_str!("../Readme.md")]
pub mod readme {}
63
/// Handle for a file or tree that was registered through [`Setup::add()`].
///
/// The handle is a key into [`FsData`]; resolve it with [`FsData::path()`]. The resolved path is
/// either the location inside your local checkout of the version control system (while
/// developing), or the location into which the file has been checked out.
#[derive(Debug)]
pub struct Files {
    /// Index of the registered item, in registration order.
    key: usize,
}
73
/// A resource whose path is owned by the `Setup` (registered by value rather than borrowed).
#[derive(Debug)]
enum Managed {
    // TODO: have a spec for the glob `<dir>/**.ext`?
    /// A single path, relative to the crate manifest.
    Files(PathBuf),
}
79
/// A caller-owned path that is rewritten in place when the setup is built.
type FsItem<'paths> = &'paths mut PathBuf;
81
/// The result of building a `Setup`: test resources that are accessible on the local file system.
///
/// Use this object to look up the local paths of resources that were registered with
/// [`Setup::add()`].
#[derive(Debug)]
pub struct FsData {
    /// Final path of every registered item, indexed by the key stored in [`Files`].
    ///
    /// The map is effectively constant once built; the VCS interpretation no longer matters.
    map: Vec<PathBuf>,
}
92
93#[derive(Debug)]
94enum Source {
95 /// The data source is the crate's repository at a specific commit id.
96 VcsFromManifest {
97 /// TODO: we should support other commit identifiers.
98 commit_id: git::CommitId,
99 /// Evidence how we plan to access the source.
100 git: git::Git,
101 /// The directory where we may put git-dir and checkout of the resources.
102 datadir: PathBuf,
103 },
104 /// The data will be relative to the crate manifest.
105 Local(git::Git),
106}
107
108#[derive(Default, Debug)]
109struct Resources<'paths> {
110 /// All files and tree that are owned by the `Setup`.
111 /// Note: we never intend to remove anything from here. If we did we would have to do some kind
112 /// of remapping data structure to ensure that `Files` does not access the wrong item.
113 relative_files: Vec<Managed>,
114 /// Resources where we do 'simple' path replacement in a filter style.
115 ///
116 /// Note on ergonomics: We MAY take several different kinds of paths in the future to allow the
117 /// glob-style usage (`tests/samples/*.png`) to be efficiently executed. However, we should NOT
118 /// change the public API for this. We may well do some wrapping internally but the calls
119 /// should map to exactly one variant of any such item; and the enum variant should not be
120 /// directly exposed.
121 ///
122 /// This is based on the needs to perform more imports and additional calls to wrap locals in
123 /// those items. Basically, adding the crate should not be much more complex than making all
124 /// paths a variable and then throwing a `xtest_data::setup!()` on top.
125 unmanaged: Vec<FsItem<'paths>>,
126}
127
128/// A builder to configure desired test data paths.
129///
130/// This is created through [`setup!`] instead of a usual method as it must gather some information
131/// from the _callers_ environment first.
132///
133/// This is a builder and after configuration, its [`Setup::build()`] method should be called. Note
134/// the lifetime on this struct. This is either the lifetime of paths borrowed from the caller,
135/// which it will rewrite, or it can be `'static` when it owns all of the paths. The latter case
136/// requires them to be registered with [`Setup::add()`].
137///
138/// On a VCS copy of the surrounding package this will simply collect and validate the information,
139/// canonicalizing paths to be interpreted from the Manifest in the process.
140///
141/// However, when executed in the source tree from `.crate` then it will rewrite them all to refer
142/// to a local copy of the data instead. That is, if it is allowed to, since by default we merely
143/// provide a detailed report of data paths, repository location, and commit information that would
144/// _need_ to be fetched before aborting. When the environment has opted into our access of network
145/// (and might have overridden the repository path) then we will perform the actual access,
146/// checkout, and rewrite.
147#[must_use = "This is only a builder. Call `build` to perform validation/fetch/etc."]
148#[derive(Debug)]
149pub struct Setup<'paths> {
150 repository: OsString,
151 manifest: &'static str,
152 /// Have we determined to be local or in a crate?.
153 source: Source,
154 /// The resources that we store.
155 resources: Resources<'paths>,
156 /// A git pack archive with files.
157 pack_objects: Option<OsString>,
158}
159
/// Options determined from the compile-time environment of the crate that called us.
///
/// This holds all the environment data gathered by the `setup` macro, which lets us observe the
/// environment flags passed to the _calling_ crate instead of our own. Please do not construct
/// this directly, since doing so could affect the integrity of the information.
///
/// This is independent of the data gathered from the _runtime_ environment; the two are combined
/// in `Setup::build`.
#[doc(hidden)]
pub struct EnvOptions {
    /// Value of `CARGO_PKG_REPOSITORY` while compiling the calling crate.
    pub pkg_repository: &'static str,
    /// Value of `CARGO_MANIFEST_DIR` while compiling the calling crate.
    pub manifest_dir: &'static str,
    /// Value of `CARGO_TARGET_TMPDIR` while compiling the calling crate, if it was set.
    pub target_tmpdir: Option<&'static str>,
}
174
/// Create a builder to configure local test data.
///
/// The macro evaluates to an instance of [`Setup`].
///
/// It can be run in _integration tests_ (and in integration tests only) to ensure that those can
/// be replicated from a source distribution of the package, while actually using additional data
/// stored in your repository. The commit ID of the head, stored inside the package, is used for
/// bit-by-bit reproducibility of the test data.
///
/// You can rely on this package only using data within the git tree associated with the commit ID
/// stored in the package. As a downstream tester, if the maintainer of the package signs their
/// crates and you validate that signature, then by extension and by Git's content addressability
/// all data is ensured to have been signed off by the maintainer.
///
/// When developing locally this checks the plausibility of the cargo data and then tries to
/// determine whether `git` is in use (other VCS are welcome but need to be supported by cargo
/// first).
///
/// ## Panics
///
/// This macro _panics_ if any of the following is true:
/// * It is invoked outside of an integration test.
/// * There is no VCS in use.
/// * We could not determine how to use the VCS of the repository.
/// * The repository URL as configured in `Cargo.toml` is not valid.
/// * We could not create a bare repository in the directory `${CARGO_TARGET_TMPDIR}`.
///
/// When executing from the distribution form of a package, we will also panic if any of the
/// following is true:
/// * The commit ID read from `.cargo_vcs_info.json` can not be fetched from the remote
///   repository.
/// * There is no `.cargo_vcs_info.json` and the manifest is _not_ in a VCS folder.
///
/// Note that the eventual call to `build()` has some additional panics.
#[macro_export]
macro_rules! setup {
    () => {
        $crate::_setup($crate::EnvOptions {
            manifest_dir: env!("CARGO_MANIFEST_DIR"),
            target_tmpdir: option_env!("CARGO_TARGET_TMPDIR"),
            // FIXME: technically the repository isn't critical information.
            // We could rely on the user passing one to us, since we fail anyway when that is
            // not a git repository with the correct commit ID. That's just their fault.
            pkg_repository: env!("CARGO_PKG_REPOSITORY"),
        })
    };
}
221
222#[doc(hidden)]
223pub fn _setup(options: EnvOptions) -> Setup<'static> {
224 let EnvOptions {
225 pkg_repository: repository,
226 manifest_dir: manifest,
227 target_tmpdir: tmpdir,
228 } = options;
229 if repository.is_empty() {
230 inconclusive(&mut "The crate must have a valid URL in `package.repository`");
231 }
232
233 // Now allow the override.
234 let repository = OsString::from(repository);
235
236 // Make sure this is an integration test, or at least we have the dir.
237 // We don't want to block building over this (e.g. the crate itself here) but we _do_ want to
238 // restrict running this `setup` function
239 let integration_test_tempdir = tmpdir.map(Path::new);
240
241 let vcs_info_path = env::var_os("CARGO_XTEST_VCS_INFO");
242 let force_vcs = vcs_info_path.is_some();
243
244 let vcs_info_path = vcs_info_path.as_ref().map_or_else(
245 || Path::new(manifest).join(".cargo_vcs_info.json"),
246 PathBuf::from,
247 );
248
249 let (source, pack_objects);
250 if vcs_info_path.exists() {
251 // Allow the override.
252 trait GetKey {
253 fn get_key(&self, key: &str) -> Option<&Self>;
254 }
255 impl GetKey for JsonValue {
256 fn get_key(&self, key: &str) -> Option<&Self> {
257 self.get::<std::collections::HashMap<_, _>>()?.get(key)
258 }
259 }
260
261 let data =
262 fs::read_to_string(vcs_info_path).unwrap_or_else(|mut err| inconclusive(&mut err));
263 let vcs: JsonValue = data
264 .parse()
265 .unwrap_or_else(|mut err| inconclusive(&mut err));
266 let commit_id = vcs
267 .get_key("git")
268 .unwrap_or_else(|| inconclusive(&mut "VCS does not contain a git section."))
269 .get_key("sha1")
270 .unwrap_or_else(|| inconclusive(&mut "VCS commit ID not recognized."))
271 .get::<String>()
272 .map(|id| git::CommitId::from(&**id))
273 .unwrap_or_else(|| inconclusive(&mut "VCS commit ID is not a string"));
274
275 // Okay, that makes sense. We know _what_ to access.
276 // Now let's also try to find out how we will access it. Let's find `git`.
277 // To shell out to because we are lazy.
278 let git = git::Git::new().unwrap_or_else(|mut err| inconclusive(&mut err));
279
280 let datadir = integration_test_tempdir
281 .map(Cow::Borrowed)
282 .or_else(|| {
283 let environment_temp = std::env::var_os("CARGO_XTEST_DATA_TMPDIR")
284 .or_else(|| std::env::var_os("TMPDIR"))
285 .map(PathBuf::from)?;
286 // TODO: nah, in this case we should have some distinguisher for the exact crate
287 // name and version in the tmpdir. At least that would catch the gravest of errors
288 // when testing many crates at the same time. (Although sharing the git dir would
289 // be an advantage).
290 Some(Cow::Owned(environment_temp))
291 })
292 .expect("This setup must only be called in an integration test or benchmark, or with an explicit TMPDIR")
293 .into_owned();
294
295 pack_objects = std::env::var_os("CARGO_XTEST_DATA_PACK_OBJECTS");
296 source = Source::VcsFromManifest {
297 commit_id,
298 git,
299 datadir,
300 };
301 } else if force_vcs {
302 inconclusive(&mut format!(
303 "Expected VCS info at {}",
304 vcs_info_path.display()
305 ));
306 } else {
307 // Check that we can recognize tracked files.
308 let git = git::Git::new().unwrap_or_else(|mut err| inconclusive(&mut err));
309 source = Source::Local(git);
310 pack_objects = std::env::var_os("CARGO_XTEST_DATA_PACK_OBJECTS");
311 };
312
313 // And finally this must be valid.
314 if repository.is_empty() {
315 inconclusive(&mut "The repository must have a valid URL");
316 }
317
318 Setup {
319 repository,
320 manifest,
321 source,
322 resources: Resources::default(),
323 pack_objects,
324 }
325}
326
327impl<'lt> Setup<'lt> {
328 /// Register some paths to rewrite their location.
329 ///
330 /// The paths should be relative to the crate's manifest. For example, to refer to data in your
331 /// `tests` directory you would use `PathBuf::from("tests/data.zip")`.
332 ///
333 /// The paths will be registered internally. If the repository is local they will be rewritten
334 /// to be relative to the manifest location. If the repository is a crate distribution then the
335 /// paths will be sparsely checked out (meaning: only that path will be downloaded from the VCS
336 /// working dir and you can't expect any other files to be present).
337 ///
338 /// Those actions will happen when you call [`Setup::build()`].
339 ///
340 /// # Example
341 ///
342 /// ```
343 /// use std::path::PathBuf;
344 /// use xtest_data::setup;
345 ///
346 /// let mut path = PathBuf::from("tests/data.zip");
347 /// setup!().rewrite([&mut path]).build();
348 ///
349 /// assert!(path.exists(), "{}", path.display());
350 /// ```
351 pub fn rewrite(mut self, iter: impl IntoIterator<Item = &'lt mut PathBuf>) -> Self {
352 self.resources.unmanaged.extend(iter);
353 self
354 }
355
356 /// Register the path of a file or a tree of files.
357 ///
358 /// The return value is a key that can later be used in [`FsData`]. All the files under this
359 /// location will be checked out when `Setup::build()` is called in a crate-build.
360 ///
361 /// # Example
362 ///
363 /// ```
364 /// let mut vcs = xtest_data::setup!();
365 /// let datazip = vcs.add("tests/data.zip");
366 /// let testdata = vcs.build();
367 ///
368 /// let path = testdata.path(&datazip);
369 /// assert!(path.exists(), "{}", path.display());
370 /// ```
371
372 pub fn add(&mut self, path: impl AsRef<Path>) -> Files {
373 fn path_impl(resources: &mut Resources, path: &Path) -> usize {
374 let item = Managed::Files(path.to_owned());
375 let key = resources.relative_files.len();
376 resources.relative_files.push(item);
377 key
378 }
379
380 let key = path_impl(&mut self.resources, path.as_ref());
381 Files { key }
382 }
383
384 /// Run the final validation and perform rewrites.
385 ///
386 /// Returns the frozen dictionary of file mappings that had been registered with
387 /// [`Setup::add()`]. This allows retrieving the final data paths for those items.
388 ///
389 /// ## Panics
390 ///
391 /// This will panic if:
392 /// * Any registered file or tree is not tracked in the VCS.
393 /// * You have not allowed retrieving data from the VCS.
394 /// * It was not possible to retrieve the data from the VCS.
395 pub fn build(self) -> FsData {
396 let mut map;
397 match self.source {
398 Source::Local(git) => {
399 let dir = git::CrateDir::new(self.manifest, &git);
400 let datapath = Path::new(self.manifest);
401 dir.tracked(&git, &mut self.resources.path_specs());
402
403 if let Some(pack_objects) = self.pack_objects {
404 std::fs::create_dir_all(&pack_objects)
405 .unwrap_or_else(|mut err| inconclusive(&mut err));
406 dir.pack_objects(&git, &mut self.resources.path_specs(), pack_objects);
407 }
408
409 map = vec![];
410 self.resources.relative_files.iter().for_each(|path| {
411 map.push(datapath.join(path.as_path()));
412 });
413
414 self.resources
415 .unmanaged
416 .into_iter()
417 .for_each(|item| set_root(datapath, item));
418 }
419 Source::VcsFromManifest {
420 commit_id,
421 datadir,
422 git,
423 } => {
424 let origin = git::Origin {
425 url: self.repository,
426 };
427
428 let gitpath = datadir.join("xtest-data-git");
429 let datapath = unique_dir(&datadir, "xtest-data-tree")
430 .unwrap_or_else(|mut err| inconclusive(&mut err));
431
432 let shallow;
433 if let Some(pack_objects) = self.pack_objects {
434 shallow = git.bare(gitpath, &commit_id);
435 shallow.unpack(&git, &pack_objects);
436 } else {
437 panic!("Requested test data from {} but have no packed artifacts to load. Provide an explicit path to a directory to unpack via the `CARGO_XTEST_DATA_PACK_OBJECTS` environment variable", Path::new(&origin.url).display());
438 }
439
440 shallow.checkout(
441 &git,
442 &datapath,
443 &commit_id,
444 &mut self.resources.path_specs(),
445 );
446 map = vec![];
447 self.resources.relative_files.iter().for_each(|path| {
448 map.push(datapath.join(path.as_path()));
449 });
450 self.resources
451 .unmanaged
452 .into_iter()
453 .for_each(|item| set_root(&datapath, item));
454 }
455 }
456
457 // In the end we just discard some information.
458 // We don't really need it anymore after the checks.
459 //
460 // TODO: of course we could avoid actually checking files onto the disk if we had some kind
461 // of `io::Read` abstraction that read them straight from `git cat` instead. But chances
462 // are you'll like your files and directory structures.
463 FsData { map }
464 }
465}
466
467impl Resources<'_> {
468 pub fn path_specs(&self) -> impl Iterator<Item = git::PathSpec<'_>> {
469 let values = self.relative_files.iter().map(Managed::as_path_spec);
470 let unmanaged = self.unmanaged.iter().map(|x| git::PathSpec::Path(&**x));
471 values.chain(unmanaged)
472 }
473}
474
475impl FsData {
476 /// Retrieve the rewritten path of a file or tree of files.
477 pub fn path(&self, file: &Files) -> &Path {
478 self.map.get(file.key).unwrap().as_path()
479 }
480}
481
482impl Managed {
483 pub fn as_path(&self) -> &Path {
484 match self {
485 Managed::Files(path) => path,
486 }
487 }
488
489 fn as_path_spec(&self) -> git::PathSpec<'_> {
490 match self {
491 Managed::Files(path) => git::PathSpec::Path(path),
492 }
493 }
494}
495
/// Replace `dir` by the result of joining it onto the root `path`.
fn set_root(path: &Path, dir: &mut PathBuf) {
    let rooted = path.join(dir.as_path());
    *dir = rooted;
}
499
500// We do not use tempdir. This should already be done by our environment (e.g. cargo).
501fn unique_dir(base: &Path, prefix: &str) -> Result<PathBuf, std::io::Error> {
502 let mut rng = nanorand::tls::tls_rng();
503 assert!(matches!(
504 Path::new(prefix).components().next(),
505 Some(std::path::Component::Normal(_))
506 ));
507 assert!(Path::new(prefix).components().nth(1).is_none());
508
509 let mut buffer = prefix.to_string();
510 let mut generate_name = move || -> PathBuf {
511 use nanorand::Rng;
512 const TABLE: &str = "0123456789abcdef";
513 let num: [u8; 8] = rng.rand();
514
515 buffer.clear();
516 buffer.push_str(prefix);
517
518 for byte in num {
519 let (low, hi) = (usize::from(byte & 0xf), usize::from((byte >> 4) & 0xf));
520 buffer.push_str(&TABLE[low..low + 1]);
521 buffer.push_str(&TABLE[hi..hi + 1]);
522 }
523
524 base.join(&buffer)
525 };
526
527 loop {
528 let path = generate_name();
529 match fs::create_dir(&path) {
530 Ok(_) => return Ok(path),
531 Err(err) if err.kind() == io::ErrorKind::AlreadyExists => {}
532 Err(other) => return Err(other),
533 }
534 }
535}
536
/// Abort the setup with a diagnostic.
///
/// The diagnostic is printed to stderr and is also carried as the panic message, so that it is
/// available to whatever reports or inspects the panic.
///
/// # Panics
///
/// Always; this function never returns.
#[cold]
#[track_caller]
fn inconclusive(err: &mut dyn std::fmt::Display) -> ! {
    eprintln!("xtest-data failed to setup.");
    eprintln!("Information: {}", err);
    panic!("xtest-data failed to setup: {}", err);
}