Skip to main content

codec_corpus/
lib.rs

1//! # codec-corpus
2//!
3//! Runtime API for downloading, caching, and accessing test image datasets
4//! from the [`imazen/codec-corpus`](https://github.com/imazen/codec-corpus)
5//! GitHub repository.
6//!
7//! No data ships with the crate. Datasets are fetched lazily on first access
8//! and cached locally.
9//!
10//! ```no_run
11//! let corpus = codec_corpus::Corpus::new().unwrap();
12//! let valid = corpus.get("webp-conformance/valid").unwrap();
13//! for entry in std::fs::read_dir(valid).unwrap() {
14//!     let path = entry.unwrap().path();
15//!     println!("{}", path.display());
16//! }
17//! ```
18
19#![forbid(unsafe_code)]
20
21mod download;
22
23use std::path::{Path, PathBuf};
24use std::time::Duration;
25
26// ---------------------------------------------------------------------------
27// Embedded metadata
28// ---------------------------------------------------------------------------
29
/// Upstream repository URL; handed to the git sparse-checkout downloader.
const REPO_URL: &str = "https://github.com/imazen/codec-corpus";
/// Version of this crate, captured at compile time from Cargo.
///
/// Its major component names the on-disk cache directory (`v{major}`), and
/// the full string is compared against the cache's `.version` file.
const CRATE_VERSION: &str = env!("CARGO_PKG_VERSION");
32
33// ---------------------------------------------------------------------------
34// Public types
35// ---------------------------------------------------------------------------
36
/// Everything that can go wrong while resolving or fetching a dataset.
///
/// The enum is `#[non_exhaustive]`, so downstream `match`es need a wildcard
/// arm.
#[derive(Debug)]
#[non_exhaustive]
pub enum Error {
    /// The dataset is not in the cache and the network could not be used.
    NetworkUnavailable {
        /// Name of the dataset that was requested.
        dataset: String,
    },
    /// The download succeeded but the requested path is still missing.
    PathNotFound {
        /// The path that was requested, as given by the caller.
        path: String,
    },
    /// A filesystem operation failed (permissions, disk full, etc.).
    Io(std::io::Error),
    /// No cache directory could be determined for this environment.
    NoCacheDir,
}
50
51impl std::fmt::Display for Error {
52    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
53        match self {
54            Error::NetworkUnavailable { dataset } => {
55                write!(f, "network unavailable and dataset '{dataset}' not cached")
56            }
57            Error::PathNotFound { path } => {
58                write!(f, "path not found: '{path}'")
59            }
60            Error::Io(e) => write!(f, "I/O error: {e}"),
61            Error::NoCacheDir => write!(f, "could not determine cache directory"),
62        }
63    }
64}
65
66impl std::error::Error for Error {
67    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
68        match self {
69            Error::Io(e) => Some(e),
70            _ => None,
71        }
72    }
73}
74
75impl From<std::io::Error> for Error {
76    fn from(e: std::io::Error) -> Self {
77        Error::Io(e)
78    }
79}
80
81// ---------------------------------------------------------------------------
82// Corpus
83// ---------------------------------------------------------------------------
84
/// Handle for accessing cached test-image datasets.
///
/// Create with [`Corpus::new()`] (default cache location) or
/// [`Corpus::with_cache_root()`] (explicit path). Then call [`Corpus::get()`]
/// to obtain the local path to a dataset — downloading it if necessary.
///
/// The handle only stores the resolved cache directory, so it is cheap to
/// clone and safe to print in diagnostics.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// Versioned cache directory: `{base}/codec-corpus/v{major}`.
    root: PathBuf,
}
93
94impl Corpus {
95    /// Initialize with default cache location.
96    ///
97    /// Resolves cache root via `CODEC_CORPUS_CACHE` env var, then
98    /// `dirs::cache_dir()`. Performs no I/O beyond directory creation.
99    pub fn new() -> Result<Self, Error> {
100        let base = if let Ok(val) = std::env::var("CODEC_CORPUS_CACHE") {
101            PathBuf::from(val)
102        } else {
103            dirs::cache_dir().ok_or(Error::NoCacheDir)?
104        };
105        Self::init(base)
106    }
107
108    /// Initialize with explicit cache root. Overrides the environment variable.
109    ///
110    /// Files will live at `{path}/codec-corpus/v{major}/`.
111    pub fn with_cache_root(path: impl Into<PathBuf>) -> Result<Self, Error> {
112        Self::init(path.into())
113    }
114
115    /// Get the local path to a corpus subdirectory, downloading if needed.
116    ///
117    /// `path` is any path into the repository (e.g. `"webp-conformance"`,
118    /// `"webp-conformance/valid"`, `"clic2025/training"`). The top-level
119    /// folder is the download unit — requesting any path under it fetches
120    /// the entire folder recursively.
121    ///
122    /// # Examples
123    ///
124    /// ```no_run
125    /// let corpus = codec_corpus::Corpus::new()?;
126    /// let webp = corpus.get("webp-conformance")?;
127    /// let valid = corpus.get("webp-conformance/valid")?;
128    /// let training = corpus.get("clic2025/training")?;
129    /// # Ok::<(), codec_corpus::Error>(())
130    /// ```
131    pub fn get(&self, path: &str) -> Result<PathBuf, Error> {
132        let top = top_level_folder(path);
133        let full_path = self.root.join(path);
134
135        // Fast path: version matches and directory exists
136        if self.version_matches() && full_path.exists() {
137            return Ok(full_path);
138        }
139
140        // Slow path: download the top-level folder
141        self.ensure_downloaded(top)?;
142
143        if full_path.exists() {
144            Ok(full_path)
145        } else {
146            Err(Error::PathNotFound {
147                path: path.to_string(),
148            })
149        }
150    }
151
152    /// Check if a path is already cached locally, without downloading.
153    pub fn is_cached(&self, path: &str) -> bool {
154        self.version_matches() && self.root.join(path).exists()
155    }
156
157    /// List datasets currently cached on disk.
158    ///
159    /// Returns directory names under the cache root, excluding internal
160    /// files (`.version`, `.lock`, `.tmp-*`).
161    pub fn list_cached(&self) -> Vec<String> {
162        let mut datasets = Vec::new();
163        let Ok(entries) = std::fs::read_dir(&self.root) else {
164            return datasets;
165        };
166        for entry in entries.flatten() {
167            let name = entry.file_name();
168            let name_str = name.to_string_lossy();
169            if name_str.starts_with('.') {
170                continue;
171            }
172            if entry.path().is_dir() {
173                datasets.push(name_str.into_owned());
174            }
175        }
176        datasets.sort();
177        datasets
178    }
179
180    // -----------------------------------------------------------------------
181    // Private helpers
182    // -----------------------------------------------------------------------
183
184    fn init(base: PathBuf) -> Result<Self, Error> {
185        let major = CRATE_VERSION
186            .split('.')
187            .next()
188            .unwrap_or("0");
189        let root = base.join("codec-corpus").join(format!("v{major}"));
190        std::fs::create_dir_all(&root).map_err(Error::Io)?;
191        Ok(Self { root })
192    }
193
194    fn version_matches(&self) -> bool {
195        let version_file = self.root.join(".version");
196        std::fs::read_to_string(&version_file)
197            .map(|v| v.trim() == CRATE_VERSION)
198            .unwrap_or(false)
199    }
200
201    /// Download the top-level folder that contains `folder`.
202    fn ensure_downloaded(&self, folder: &str) -> Result<(), Error> {
203        let lock_path = self.root.join(".lock");
204        let lock_file = std::fs::File::create(&lock_path).map_err(Error::Io)?;
205        let mut lock = fd_lock::RwLock::new(lock_file);
206        let _guard = lock.write().map_err(Error::Io)?;
207
208        // Re-check after acquiring lock — another process may have finished
209        if self.version_matches() && self.root.join(folder).is_dir() {
210            cleanup_old_temps(&self.root);
211            return Ok(());
212        }
213
214        // If version mismatch, we need to re-download
215        let need_version_reset = !self.version_matches();
216
217        if need_version_reset {
218            self.clear_datasets();
219        }
220
221        // Git sparse-checks out just the folder; HTTP downloads the
222        // root-folder tarball (which may include sibling paths).
223        let download_result = download::try_git_sparse_checkout(
224            &self.root,
225            folder,
226            CRATE_VERSION,
227            REPO_URL,
228        )
229        .or_else(|_| download::try_http_download(&self.root, folder, CRATE_VERSION));
230
231        cleanup_old_temps(&self.root);
232        download_result?;
233
234        write_version_file(&self.root, CRATE_VERSION)?;
235        Ok(())
236    }
237
238    fn clear_datasets(&self) {
239        if let Ok(entries) = std::fs::read_dir(&self.root) {
240            for entry in entries.flatten() {
241                let name = entry.file_name();
242                let name_str = name.to_string_lossy();
243                // Keep .lock and .tmp-* entries (temps cleaned separately)
244                if name_str == ".lock" || name_str.starts_with(".tmp-") {
245                    continue;
246                }
247                let path = entry.path();
248                if path.is_dir() {
249                    let _ = std::fs::remove_dir_all(&path);
250                } else {
251                    let _ = std::fs::remove_file(&path);
252                }
253            }
254        }
255    }
256}
257
258// ---------------------------------------------------------------------------
259// Free helpers
260// ---------------------------------------------------------------------------
261
/// Extract the first meaningful path component (the root folder / download unit).
///
/// Empty segments and `.` segments are skipped, so `"a/b"`, `"/a/b"` and
/// `"./a/b"` all yield `"a"`. Returns `""` when `path` has no such component
/// (e.g. `""` or `"/"`).
fn top_level_folder(path: &str) -> &str {
    path.split('/')
        .find(|segment| !segment.is_empty() && *segment != ".")
        .unwrap_or("")
}
266
267/// Atomically write the version file (write to temp, then rename).
268fn write_version_file(root: &std::path::Path, version: &str) -> Result<(), Error> {
269    let version_file = root.join(".version");
270    let tmp = root.join(".version.tmp");
271    std::fs::write(&tmp, version).map_err(Error::Io)?;
272    std::fs::rename(&tmp, &version_file).map_err(Error::Io)?;
273    Ok(())
274}
275
/// Sweep `.tmp-*` entries directly under `root` whose modification time is
/// more than one hour in the past (leftovers from crashed runs).
///
/// Best effort throughout: an unreadable `root`, unreadable metadata, an
/// unavailable or future modification time, and failed removals are all
/// silently skipped.
fn cleanup_old_temps(root: &Path) {
    const MAX_AGE: Duration = Duration::from_secs(3600);

    let Ok(listing) = std::fs::read_dir(root) else {
        return;
    };

    let stale = listing.flatten().filter(|entry| {
        if !entry.file_name().to_string_lossy().starts_with(".tmp-") {
            return false;
        }
        let Ok(meta) = entry.metadata() else {
            return false;
        };
        // Unknown or future mtime counts as age zero, i.e. "keep".
        let age = meta
            .modified()
            .ok()
            .and_then(|stamp| stamp.elapsed().ok())
            .unwrap_or_default();
        age > MAX_AGE
    });

    for entry in stale {
        let target = entry.path();
        let _ = if target.is_dir() {
            std::fs::remove_dir_all(&target)
        } else {
            std::fs::remove_file(&target)
        };
    }
}
307
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `Corpus` rooted at a freshly emptied directory named
    /// `dir_name` under the system temp dir. Returns the directory too so the
    /// test can remove it afterwards.
    fn fresh_corpus(dir_name: &str) -> (std::path::PathBuf, Corpus) {
        let scratch = std::env::temp_dir().join(dir_name);
        let _ = std::fs::remove_dir_all(&scratch);
        let corpus = Corpus::with_cache_root(&scratch).unwrap();
        (scratch, corpus)
    }

    #[test]
    fn test_top_level_folder() {
        let cases = [
            ("webp-conformance", "webp-conformance"),
            ("webp-conformance/valid", "webp-conformance"),
            ("clic2025/training/subdir", "clic2025"),
        ];
        for (input, expected) in cases {
            assert_eq!(top_level_folder(input), expected);
        }
    }

    #[test]
    fn test_list_cached_empty() {
        let (scratch, corpus) = fresh_corpus("codec-corpus-test-list-cached");
        assert!(corpus.list_cached().is_empty());
        let _ = std::fs::remove_dir_all(scratch);
    }

    #[test]
    fn test_list_cached_with_dirs() {
        let (scratch, corpus) = fresh_corpus("codec-corpus-test-list-cached2");
        // Two fake datasets plus one hidden directory that must be excluded.
        for dir in ["alpha", "beta", ".tmp-123"] {
            std::fs::create_dir_all(corpus.root.join(dir)).unwrap();
        }
        let cached = corpus.list_cached();
        assert_eq!(cached, vec!["alpha", "beta"]);
        let _ = std::fs::remove_dir_all(scratch);
    }

    #[test]
    fn test_unknown_dataset_downloads() {
        // With no hardcoded list, any name is accepted (will fail at download)
        let (scratch, corpus) = fresh_corpus("codec-corpus-test-any-name");
        let outcome = corpus.get("nonexistent-dataset");
        // Should fail with NetworkUnavailable, not UnknownDataset
        assert!(matches!(outcome, Err(Error::NetworkUnavailable { .. })));
        let _ = std::fs::remove_dir_all(scratch);
    }

    #[test]
    fn test_is_cached_empty() {
        let (scratch, corpus) = fresh_corpus("codec-corpus-test-cached");
        assert!(!corpus.is_cached("webp-conformance"));
        let _ = std::fs::remove_dir_all(scratch);
    }

    #[test]
    fn test_version_matches() {
        let (scratch, corpus) = fresh_corpus("codec-corpus-test-version");

        // No marker yet.
        assert!(!corpus.version_matches());

        // Marker for this crate version.
        write_version_file(&corpus.root, CRATE_VERSION).unwrap();
        assert!(corpus.version_matches());

        // Marker for some other version.
        write_version_file(&corpus.root, "0.0.0-fake").unwrap();
        assert!(!corpus.version_matches());

        let _ = std::fs::remove_dir_all(scratch);
    }
}
379}