Skip to main content

codec_corpus/
lib.rs

1//! # codec-corpus
2//!
3//! Runtime API for downloading, caching, and accessing test image datasets
4//! from the [`imazen/codec-corpus`](https://github.com/imazen/codec-corpus)
5//! GitHub repository.
6//!
7//! No data ships with the crate. Datasets are fetched lazily on first access
8//! and cached locally.
9//!
10//! ```no_run
11//! let corpus = codec_corpus::Corpus::new().unwrap();
12//! let valid = corpus.get("webp-conformance/valid").unwrap();
13//! for entry in std::fs::read_dir(valid).unwrap() {
14//!     let path = entry.unwrap().path();
15//!     println!("{}", path.display());
16//! }
17//! ```
18
19mod download;
20
21use std::path::PathBuf;
22use std::time::Duration;
23
24// ---------------------------------------------------------------------------
25// Embedded metadata
26// ---------------------------------------------------------------------------
27
28const REPO_URL: &str = "https://github.com/imazen/codec-corpus";
29const CRATE_VERSION: &str = env!("CARGO_PKG_VERSION");
30
31// ---------------------------------------------------------------------------
32// Public types
33// ---------------------------------------------------------------------------
34
/// Errors that can occur when using the corpus.
///
/// Marked `#[non_exhaustive]`, so downstream `match` expressions need a
/// wildcard arm.
#[derive(Debug)]
#[non_exhaustive]
pub enum Error {
    /// Network unavailable and dataset not in cache.
    ///
    /// Not constructed in this file — presumably produced by the `download`
    /// module; `dataset` is assumed to be the requested folder name
    /// (TODO confirm against `download`).
    NetworkUnavailable { dataset: String },
    /// Requested path does not exist after successful download.
    ///
    /// `path` is the string the caller passed to `Corpus::get`.
    PathNotFound { path: String },
    /// Filesystem error (permissions, disk full, etc.).
    ///
    /// Wraps the underlying `std::io::Error`, which is also exposed through
    /// `std::error::Error::source`.
    Io(std::io::Error),
    /// No cache directory could be determined.
    ///
    /// Returned by `Corpus::new` when `dirs::cache_dir()` yields `None`.
    NoCacheDir,
}
48
49impl std::fmt::Display for Error {
50    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
51        match self {
52            Error::NetworkUnavailable { dataset } => {
53                write!(f, "network unavailable and dataset '{dataset}' not cached")
54            }
55            Error::PathNotFound { path } => {
56                write!(f, "path not found: '{path}'")
57            }
58            Error::Io(e) => write!(f, "I/O error: {e}"),
59            Error::NoCacheDir => write!(f, "could not determine cache directory"),
60        }
61    }
62}
63
64impl std::error::Error for Error {
65    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
66        match self {
67            Error::Io(e) => Some(e),
68            _ => None,
69        }
70    }
71}
72
73impl From<std::io::Error> for Error {
74    fn from(e: std::io::Error) -> Self {
75        Error::Io(e)
76    }
77}
78
79// ---------------------------------------------------------------------------
80// Corpus
81// ---------------------------------------------------------------------------
82
/// Handle for accessing cached test-image datasets.
///
/// Create with [`Corpus::new()`] (default cache location) or
/// [`Corpus::with_cache_root()`] (explicit path). Then call [`Corpus::get()`]
/// to obtain the local path to a dataset — downloading it if necessary.
///
/// The handle only stores the resolved cache directory, so it is cheap to
/// clone and safe to print with `{:?}`.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// Versioned cache directory: `{base}/codec-corpus/v{major}`.
    root: PathBuf,
}
91
92impl Corpus {
93    /// Initialize with default cache location.
94    ///
95    /// Resolves cache root via `CODEC_CORPUS_CACHE` env var, then
96    /// `dirs::cache_dir()`. Performs no I/O beyond directory creation.
97    pub fn new() -> Result<Self, Error> {
98        let base = if let Ok(val) = std::env::var("CODEC_CORPUS_CACHE") {
99            PathBuf::from(val)
100        } else {
101            dirs::cache_dir().ok_or(Error::NoCacheDir)?
102        };
103        Self::init(base)
104    }
105
106    /// Initialize with explicit cache root. Overrides the environment variable.
107    ///
108    /// Files will live at `{path}/codec-corpus/v{major}/`.
109    pub fn with_cache_root(path: impl Into<PathBuf>) -> Result<Self, Error> {
110        Self::init(path.into())
111    }
112
113    /// Get the local path to a corpus subdirectory, downloading if needed.
114    ///
115    /// `path` is any path into the repository (e.g. `"webp-conformance"`,
116    /// `"webp-conformance/valid"`, `"clic2025/training"`). The top-level
117    /// folder is the download unit — requesting any path under it fetches
118    /// the entire folder recursively.
119    ///
120    /// # Examples
121    ///
122    /// ```no_run
123    /// let corpus = codec_corpus::Corpus::new()?;
124    /// let webp = corpus.get("webp-conformance")?;
125    /// let valid = corpus.get("webp-conformance/valid")?;
126    /// let training = corpus.get("clic2025/training")?;
127    /// # Ok::<(), codec_corpus::Error>(())
128    /// ```
129    pub fn get(&self, path: &str) -> Result<PathBuf, Error> {
130        let top = top_level_folder(path);
131        let full_path = self.root.join(path);
132
133        // Fast path: version matches and directory exists
134        if self.version_matches() && full_path.exists() {
135            return Ok(full_path);
136        }
137
138        // Slow path: download the top-level folder
139        self.ensure_downloaded(top)?;
140
141        if full_path.exists() {
142            Ok(full_path)
143        } else {
144            Err(Error::PathNotFound {
145                path: path.to_string(),
146            })
147        }
148    }
149
150    /// Check if a path is already cached locally, without downloading.
151    pub fn is_cached(&self, path: &str) -> bool {
152        self.version_matches() && self.root.join(path).exists()
153    }
154
155    /// List datasets currently cached on disk.
156    ///
157    /// Returns directory names under the cache root, excluding internal
158    /// files (`.version`, `.lock`, `.tmp-*`).
159    pub fn list_cached(&self) -> Vec<String> {
160        let mut datasets = Vec::new();
161        let Ok(entries) = std::fs::read_dir(&self.root) else {
162            return datasets;
163        };
164        for entry in entries.flatten() {
165            let name = entry.file_name();
166            let name_str = name.to_string_lossy();
167            if name_str.starts_with('.') {
168                continue;
169            }
170            if entry.path().is_dir() {
171                datasets.push(name_str.into_owned());
172            }
173        }
174        datasets.sort();
175        datasets
176    }
177
178    // -----------------------------------------------------------------------
179    // Private helpers
180    // -----------------------------------------------------------------------
181
182    fn init(base: PathBuf) -> Result<Self, Error> {
183        let major = CRATE_VERSION
184            .split('.')
185            .next()
186            .unwrap_or("0");
187        let root = base.join("codec-corpus").join(format!("v{major}"));
188        std::fs::create_dir_all(&root).map_err(Error::Io)?;
189        Ok(Self { root })
190    }
191
192    fn version_matches(&self) -> bool {
193        let version_file = self.root.join(".version");
194        std::fs::read_to_string(&version_file)
195            .map(|v| v.trim() == CRATE_VERSION)
196            .unwrap_or(false)
197    }
198
199    /// Download the top-level folder that contains `folder`.
200    fn ensure_downloaded(&self, folder: &str) -> Result<(), Error> {
201        let lock_path = self.root.join(".lock");
202        let lock_file = std::fs::File::create(&lock_path).map_err(Error::Io)?;
203        let mut lock = fd_lock::RwLock::new(lock_file);
204        let _guard = lock.write().map_err(Error::Io)?;
205
206        // Re-check after acquiring lock — another process may have finished
207        if self.version_matches() && self.root.join(folder).is_dir() {
208            cleanup_old_temps(&self.root);
209            return Ok(());
210        }
211
212        // If version mismatch, we need to re-download
213        let need_version_reset = !self.version_matches();
214
215        if need_version_reset {
216            self.clear_datasets();
217        }
218
219        // Git sparse-checks out just the folder; HTTP downloads the
220        // root-folder tarball (which may include sibling paths).
221        let download_result = download::try_git_sparse_checkout(
222            &self.root,
223            folder,
224            CRATE_VERSION,
225            REPO_URL,
226        )
227        .or_else(|_| download::try_http_download(&self.root, folder, CRATE_VERSION));
228
229        cleanup_old_temps(&self.root);
230        download_result?;
231
232        write_version_file(&self.root, CRATE_VERSION)?;
233        Ok(())
234    }
235
236    fn clear_datasets(&self) {
237        if let Ok(entries) = std::fs::read_dir(&self.root) {
238            for entry in entries.flatten() {
239                let name = entry.file_name();
240                let name_str = name.to_string_lossy();
241                // Keep .lock and .tmp-* entries (temps cleaned separately)
242                if name_str == ".lock" || name_str.starts_with(".tmp-") {
243                    continue;
244                }
245                let path = entry.path();
246                if path.is_dir() {
247                    let _ = std::fs::remove_dir_all(&path);
248                } else {
249                    let _ = std::fs::remove_file(&path);
250                }
251            }
252        }
253    }
254}
255
256// ---------------------------------------------------------------------------
257// Free helpers
258// ---------------------------------------------------------------------------
259
/// Return everything before the first `/` in `path` — the root folder that
/// serves as the download unit. A path without `/` is returned unchanged.
fn top_level_folder(path: &str) -> &str {
    match path.split_once('/') {
        Some((head, _rest)) => head,
        None => path,
    }
}
264
265/// Atomically write the version file (write to temp, then rename).
266fn write_version_file(root: &std::path::Path, version: &str) -> Result<(), Error> {
267    let version_file = root.join(".version");
268    let tmp = root.join(".version.tmp");
269    std::fs::write(&tmp, version).map_err(Error::Io)?;
270    std::fs::rename(&tmp, &version_file).map_err(Error::Io)?;
271    Ok(())
272}
273
/// Remove `.tmp-*` entries older than 1 hour (orphaned from crashed runs).
///
/// Takes a borrowed `Path` rather than `&PathBuf`, matching
/// `write_version_file`; callers holding a `PathBuf` can still pass `&buf`.
///
/// Best-effort: an unreadable `root`, unreadable metadata, and failed
/// removals are all silently skipped. An entry whose modification time is
/// unavailable (or lies in the future) is treated as age zero and kept.
fn cleanup_old_temps(root: &std::path::Path) {
    const MAX_TEMP_AGE: Duration = Duration::from_secs(3600);

    let Ok(entries) = std::fs::read_dir(root) else {
        return;
    };

    for entry in entries.flatten() {
        let name = entry.file_name();
        if !name.to_string_lossy().starts_with(".tmp-") {
            continue;
        }
        let Ok(meta) = entry.metadata() else {
            continue;
        };
        let age = meta
            .modified()
            .ok()
            .and_then(|stamp| stamp.elapsed().ok())
            .unwrap_or_default();
        if age <= MAX_TEMP_AGE {
            continue;
        }

        let path = entry.path();
        let _ = if path.is_dir() {
            std::fs::remove_dir_all(&path)
        } else {
            std::fs::remove_file(&path)
        };
    }
}
305
#[cfg(test)]
mod tests {
    use super::*;

    /// Wipe `{temp_dir}/{dir_name}` and open a `Corpus` rooted there.
    /// Returns the scratch directory so the test can remove it afterwards.
    fn scratch_corpus(dir_name: &str) -> (std::path::PathBuf, Corpus) {
        let scratch = std::env::temp_dir().join(dir_name);
        let _ = std::fs::remove_dir_all(&scratch);
        let corpus = Corpus::with_cache_root(&scratch).unwrap();
        (scratch, corpus)
    }

    #[test]
    fn test_top_level_folder() {
        let cases = [
            ("webp-conformance", "webp-conformance"),
            ("webp-conformance/valid", "webp-conformance"),
            ("clic2025/training/subdir", "clic2025"),
        ];
        for (input, expected) in cases {
            assert_eq!(top_level_folder(input), expected);
        }
    }

    #[test]
    fn test_list_cached_empty() {
        let (scratch, corpus) = scratch_corpus("codec-corpus-test-list-cached");
        assert!(corpus.list_cached().is_empty());
        let _ = std::fs::remove_dir_all(scratch);
    }

    #[test]
    fn test_list_cached_with_dirs() {
        let (scratch, corpus) = scratch_corpus("codec-corpus-test-list-cached2");
        // Two fake datasets plus one hidden directory that must be filtered out.
        for dir in ["alpha", "beta", ".tmp-123"] {
            std::fs::create_dir_all(corpus.root.join(dir)).unwrap();
        }
        let cached = corpus.list_cached();
        assert_eq!(cached, vec!["alpha", "beta"]);
        let _ = std::fs::remove_dir_all(scratch);
    }

    #[test]
    fn test_unknown_dataset_downloads() {
        // There is no hardcoded dataset list, so any name is accepted and
        // the failure only surfaces at download time.
        let (scratch, corpus) = scratch_corpus("codec-corpus-test-any-name");
        let result = corpus.get("nonexistent-dataset");
        // Should fail with NetworkUnavailable, not UnknownDataset
        assert!(matches!(result, Err(Error::NetworkUnavailable { .. })));
        let _ = std::fs::remove_dir_all(scratch);
    }

    #[test]
    fn test_is_cached_empty() {
        let (scratch, corpus) = scratch_corpus("codec-corpus-test-cached");
        assert!(!corpus.is_cached("webp-conformance"));
        let _ = std::fs::remove_dir_all(scratch);
    }

    #[test]
    fn test_version_matches() {
        let (scratch, corpus) = scratch_corpus("codec-corpus-test-version");
        // No version file yet.
        assert!(!corpus.version_matches());

        // Matching version recorded.
        write_version_file(&corpus.root, CRATE_VERSION).unwrap();
        assert!(corpus.version_matches());

        // Overwritten with a foreign version.
        write_version_file(&corpus.root, "0.0.0-fake").unwrap();
        assert!(!corpus.version_matches());

        let _ = std::fs::remove_dir_all(scratch);
    }
}