1#![forbid(unsafe_code)]
20
21mod download;
22
23use std::path::{Path, PathBuf};
24use std::time::Duration;
25
26const REPO_URL: &str = "https://github.com/imazen/codec-corpus";
31const CRATE_VERSION: &str = env!("CARGO_PKG_VERSION");
32
/// Errors reported by the corpus cache.
///
/// The enum is `#[non_exhaustive]`, so matches outside this crate need a wildcard arm.
#[non_exhaustive]
#[derive(Debug)]
pub enum Error {
    /// The dataset is not cached and the network could not be used to fetch it.
    ///
    /// NOTE(review): not constructed in this part of the file — presumably produced by the
    /// `download` module; confirm there.
    NetworkUnavailable {
        /// Name of the dataset that was requested.
        dataset: String,
    },
    /// The requested path does not exist inside the cache, even after the download step.
    PathNotFound {
        /// The path exactly as the caller supplied it.
        path: String,
    },
    /// An underlying filesystem operation failed.
    Io(std::io::Error),
    /// No base cache directory could be determined for the current environment.
    NoCacheDir,
}
50
51impl std::fmt::Display for Error {
52 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
53 match self {
54 Error::NetworkUnavailable { dataset } => {
55 write!(f, "network unavailable and dataset '{dataset}' not cached")
56 }
57 Error::PathNotFound { path } => {
58 write!(f, "path not found: '{path}'")
59 }
60 Error::Io(e) => write!(f, "I/O error: {e}"),
61 Error::NoCacheDir => write!(f, "could not determine cache directory"),
62 }
63 }
64}
65
66impl std::error::Error for Error {
67 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
68 match self {
69 Error::Io(e) => Some(e),
70 _ => None,
71 }
72 }
73}
74
75impl From<std::io::Error> for Error {
76 fn from(e: std::io::Error) -> Self {
77 Error::Io(e)
78 }
79}
80
/// Handle to an on-disk cache of corpus datasets.
///
/// The handle only stores the cache location, so it is cheap to clone and safe to print.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// Versioned cache directory (`<base>/codec-corpus/v<major>`) that holds the dataset
    /// folders plus the `.version` and `.lock` bookkeeping files.
    root: PathBuf,
}
93
94impl Corpus {
95 pub fn new() -> Result<Self, Error> {
100 let base = if let Ok(val) = std::env::var("CODEC_CORPUS_CACHE") {
101 PathBuf::from(val)
102 } else {
103 dirs::cache_dir().ok_or(Error::NoCacheDir)?
104 };
105 Self::init(base)
106 }
107
108 pub fn with_cache_root(path: impl Into<PathBuf>) -> Result<Self, Error> {
112 Self::init(path.into())
113 }
114
115 pub fn get(&self, path: &str) -> Result<PathBuf, Error> {
132 let top = top_level_folder(path);
133 let full_path = self.root.join(path);
134
135 if self.version_matches() && full_path.exists() {
137 return Ok(full_path);
138 }
139
140 self.ensure_downloaded(top)?;
142
143 if full_path.exists() {
144 Ok(full_path)
145 } else {
146 Err(Error::PathNotFound {
147 path: path.to_string(),
148 })
149 }
150 }
151
152 pub fn is_cached(&self, path: &str) -> bool {
154 self.version_matches() && self.root.join(path).exists()
155 }
156
157 pub fn list_cached(&self) -> Vec<String> {
162 let mut datasets = Vec::new();
163 let Ok(entries) = std::fs::read_dir(&self.root) else {
164 return datasets;
165 };
166 for entry in entries.flatten() {
167 let name = entry.file_name();
168 let name_str = name.to_string_lossy();
169 if name_str.starts_with('.') {
170 continue;
171 }
172 if entry.path().is_dir() {
173 datasets.push(name_str.into_owned());
174 }
175 }
176 datasets.sort();
177 datasets
178 }
179
180 fn init(base: PathBuf) -> Result<Self, Error> {
185 let major = CRATE_VERSION
186 .split('.')
187 .next()
188 .unwrap_or("0");
189 let root = base.join("codec-corpus").join(format!("v{major}"));
190 std::fs::create_dir_all(&root).map_err(Error::Io)?;
191 Ok(Self { root })
192 }
193
194 fn version_matches(&self) -> bool {
195 let version_file = self.root.join(".version");
196 std::fs::read_to_string(&version_file)
197 .map(|v| v.trim() == CRATE_VERSION)
198 .unwrap_or(false)
199 }
200
201 fn ensure_downloaded(&self, folder: &str) -> Result<(), Error> {
203 let lock_path = self.root.join(".lock");
204 let lock_file = std::fs::File::create(&lock_path).map_err(Error::Io)?;
205 let mut lock = fd_lock::RwLock::new(lock_file);
206 let _guard = lock.write().map_err(Error::Io)?;
207
208 if self.version_matches() && self.root.join(folder).is_dir() {
210 cleanup_old_temps(&self.root);
211 return Ok(());
212 }
213
214 let need_version_reset = !self.version_matches();
216
217 if need_version_reset {
218 self.clear_datasets();
219 }
220
221 let download_result = download::try_git_sparse_checkout(
224 &self.root,
225 folder,
226 CRATE_VERSION,
227 REPO_URL,
228 )
229 .or_else(|_| download::try_http_download(&self.root, folder, CRATE_VERSION));
230
231 cleanup_old_temps(&self.root);
232 download_result?;
233
234 write_version_file(&self.root, CRATE_VERSION)?;
235 Ok(())
236 }
237
238 fn clear_datasets(&self) {
239 if let Ok(entries) = std::fs::read_dir(&self.root) {
240 for entry in entries.flatten() {
241 let name = entry.file_name();
242 let name_str = name.to_string_lossy();
243 if name_str == ".lock" || name_str.starts_with(".tmp-") {
245 continue;
246 }
247 let path = entry.path();
248 if path.is_dir() {
249 let _ = std::fs::remove_dir_all(&path);
250 } else {
251 let _ = std::fs::remove_file(&path);
252 }
253 }
254 }
255 }
256}
257
/// Returns the part of `path` before its first `/`, or all of `path` when it contains none.
///
/// A leading `/` therefore yields an empty string.
fn top_level_folder(path: &str) -> &str {
    match path.split_once('/') {
        Some((head, _rest)) => head,
        None => path,
    }
}
266
267fn write_version_file(root: &std::path::Path, version: &str) -> Result<(), Error> {
269 let version_file = root.join(".version");
270 let tmp = root.join(".version.tmp");
271 std::fs::write(&tmp, version).map_err(Error::Io)?;
272 std::fs::rename(&tmp, &version_file).map_err(Error::Io)?;
273 Ok(())
274}
275
/// Best-effort sweep of `.tmp-*` entries in `root` whose modification time is more than one
/// hour old.
///
/// Entries whose metadata or modification time cannot be read, or whose timestamp lies in the
/// future, are left alone. Removal failures are ignored, and an unreadable `root` is a no-op.
fn cleanup_old_temps(root: &Path) {
    const MAX_AGE: Duration = Duration::from_secs(3600);

    let Ok(listing) = std::fs::read_dir(root) else {
        return;
    };

    let stale = listing.flatten().filter(|entry| {
        if !entry.file_name().to_string_lossy().starts_with(".tmp-") {
            return false;
        }
        entry
            .metadata()
            .ok()
            .and_then(|meta| meta.modified().ok())
            .and_then(|mtime| mtime.elapsed().ok())
            .map_or(false, |age| age > MAX_AGE)
    });

    for entry in stale {
        let target = entry.path();
        let _ = if target.is_dir() {
            std::fs::remove_dir_all(&target)
        } else {
            std::fs::remove_file(&target)
        };
    }
}
307
#[cfg(test)]
mod tests {
    use super::*;

    /// Wipes `<temp>/<dir_name>` and opens a `Corpus` rooted there.
    ///
    /// Returns the base directory too so each test can remove it when done.
    fn scratch(dir_name: &str) -> (std::path::PathBuf, Corpus) {
        let base = std::env::temp_dir().join(dir_name);
        let _ = std::fs::remove_dir_all(&base);
        let corpus = Corpus::with_cache_root(&base).unwrap();
        (base, corpus)
    }

    #[test]
    fn test_top_level_folder() {
        let cases = [
            ("webp-conformance", "webp-conformance"),
            ("webp-conformance/valid", "webp-conformance"),
            ("clic2025/training/subdir", "clic2025"),
        ];
        for (input, expected) in cases {
            assert_eq!(top_level_folder(input), expected);
        }
    }

    #[test]
    fn test_list_cached_empty() {
        let (base, corpus) = scratch("codec-corpus-test-list-cached");
        assert!(corpus.list_cached().is_empty());
        let _ = std::fs::remove_dir_all(base);
    }

    #[test]
    fn test_list_cached_with_dirs() {
        let (base, corpus) = scratch("codec-corpus-test-list-cached2");
        // The dot-prefixed directory must not show up in the listing.
        for dir in ["alpha", "beta", ".tmp-123"] {
            std::fs::create_dir_all(corpus.root.join(dir)).unwrap();
        }
        assert_eq!(corpus.list_cached(), vec!["alpha", "beta"]);
        let _ = std::fs::remove_dir_all(base);
    }

    #[test]
    fn test_unknown_dataset_downloads() {
        let (base, corpus) = scratch("codec-corpus-test-any-name");
        let outcome = corpus.get("nonexistent-dataset");
        assert!(matches!(outcome, Err(Error::NetworkUnavailable { .. })));
        let _ = std::fs::remove_dir_all(base);
    }

    #[test]
    fn test_is_cached_empty() {
        let (base, corpus) = scratch("codec-corpus-test-cached");
        assert!(!corpus.is_cached("webp-conformance"));
        let _ = std::fs::remove_dir_all(base);
    }

    #[test]
    fn test_version_matches() {
        let (base, corpus) = scratch("codec-corpus-test-version");
        // No marker yet.
        assert!(!corpus.version_matches());

        // Marker equal to the crate version.
        write_version_file(&corpus.root, CRATE_VERSION).unwrap();
        assert!(corpus.version_matches());

        // Marker from some other version.
        write_version_file(&corpus.root, "0.0.0-fake").unwrap();
        assert!(!corpus.version_matches());

        let _ = std::fs::remove_dir_all(base);
    }
}