1mod download;
20
21use std::path::PathBuf;
22use std::time::Duration;
23
/// URL of the upstream corpus repository; passed to the git sparse-checkout download path.
const REPO_URL: &str = "https://github.com/imazen/codec-corpus";

/// Version of this crate, captured from Cargo at compile time.
///
/// It is compared against the cache's `.version` marker, written to that marker after a
/// successful download, and its leading component names the versioned cache directory.
const CRATE_VERSION: &str = env!("CARGO_PKG_VERSION");
30
/// Errors reported by [`Corpus`] operations.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must include a wildcard arm
/// when matching on it.
#[derive(Debug)]
#[non_exhaustive]
pub enum Error {
    /// The network was unavailable and the named dataset is not in the cache
    /// (wording taken from the variant's `Display` message).
    NetworkUnavailable { dataset: String },
    /// The requested path does not exist in the cache, even after a download attempt.
    PathNotFound { path: String },
    /// An underlying filesystem operation failed; the wrapped error is exposed via `source()`.
    Io(std::io::Error),
    /// No platform cache directory could be determined and no override was supplied.
    NoCacheDir,
}
48
49impl std::fmt::Display for Error {
50 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
51 match self {
52 Error::NetworkUnavailable { dataset } => {
53 write!(f, "network unavailable and dataset '{dataset}' not cached")
54 }
55 Error::PathNotFound { path } => {
56 write!(f, "path not found: '{path}'")
57 }
58 Error::Io(e) => write!(f, "I/O error: {e}"),
59 Error::NoCacheDir => write!(f, "could not determine cache directory"),
60 }
61 }
62}
63
64impl std::error::Error for Error {
65 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
66 match self {
67 Error::Io(e) => Some(e),
68 _ => None,
69 }
70 }
71}
72
73impl From<std::io::Error> for Error {
74 fn from(e: std::io::Error) -> Self {
75 Error::Io(e)
76 }
77}
78
/// Handle to an on-disk cache of corpus datasets.
///
/// Create one with [`Corpus::new`] or [`Corpus::with_cache_root`], then resolve dataset
/// paths with [`Corpus::get`].
pub struct Corpus {
    /// Versioned cache directory, `<base>/codec-corpus/v<major>`; created during construction.
    root: PathBuf,
}
91
92impl Corpus {
93 pub fn new() -> Result<Self, Error> {
98 let base = if let Ok(val) = std::env::var("CODEC_CORPUS_CACHE") {
99 PathBuf::from(val)
100 } else {
101 dirs::cache_dir().ok_or(Error::NoCacheDir)?
102 };
103 Self::init(base)
104 }
105
106 pub fn with_cache_root(path: impl Into<PathBuf>) -> Result<Self, Error> {
110 Self::init(path.into())
111 }
112
113 pub fn get(&self, path: &str) -> Result<PathBuf, Error> {
130 let top = top_level_folder(path);
131 let full_path = self.root.join(path);
132
133 if self.version_matches() && full_path.exists() {
135 return Ok(full_path);
136 }
137
138 self.ensure_downloaded(top)?;
140
141 if full_path.exists() {
142 Ok(full_path)
143 } else {
144 Err(Error::PathNotFound {
145 path: path.to_string(),
146 })
147 }
148 }
149
150 pub fn is_cached(&self, path: &str) -> bool {
152 self.version_matches() && self.root.join(path).exists()
153 }
154
155 pub fn list_cached(&self) -> Vec<String> {
160 let mut datasets = Vec::new();
161 let Ok(entries) = std::fs::read_dir(&self.root) else {
162 return datasets;
163 };
164 for entry in entries.flatten() {
165 let name = entry.file_name();
166 let name_str = name.to_string_lossy();
167 if name_str.starts_with('.') {
168 continue;
169 }
170 if entry.path().is_dir() {
171 datasets.push(name_str.into_owned());
172 }
173 }
174 datasets.sort();
175 datasets
176 }
177
178 fn init(base: PathBuf) -> Result<Self, Error> {
183 let major = CRATE_VERSION
184 .split('.')
185 .next()
186 .unwrap_or("0");
187 let root = base.join("codec-corpus").join(format!("v{major}"));
188 std::fs::create_dir_all(&root).map_err(Error::Io)?;
189 Ok(Self { root })
190 }
191
192 fn version_matches(&self) -> bool {
193 let version_file = self.root.join(".version");
194 std::fs::read_to_string(&version_file)
195 .map(|v| v.trim() == CRATE_VERSION)
196 .unwrap_or(false)
197 }
198
199 fn ensure_downloaded(&self, folder: &str) -> Result<(), Error> {
201 let lock_path = self.root.join(".lock");
202 let lock_file = std::fs::File::create(&lock_path).map_err(Error::Io)?;
203 let mut lock = fd_lock::RwLock::new(lock_file);
204 let _guard = lock.write().map_err(Error::Io)?;
205
206 if self.version_matches() && self.root.join(folder).is_dir() {
208 cleanup_old_temps(&self.root);
209 return Ok(());
210 }
211
212 let need_version_reset = !self.version_matches();
214
215 if need_version_reset {
216 self.clear_datasets();
217 }
218
219 let download_result = download::try_git_sparse_checkout(
222 &self.root,
223 folder,
224 CRATE_VERSION,
225 REPO_URL,
226 )
227 .or_else(|_| download::try_http_download(&self.root, folder, CRATE_VERSION));
228
229 cleanup_old_temps(&self.root);
230 download_result?;
231
232 write_version_file(&self.root, CRATE_VERSION)?;
233 Ok(())
234 }
235
236 fn clear_datasets(&self) {
237 if let Ok(entries) = std::fs::read_dir(&self.root) {
238 for entry in entries.flatten() {
239 let name = entry.file_name();
240 let name_str = name.to_string_lossy();
241 if name_str == ".lock" || name_str.starts_with(".tmp-") {
243 continue;
244 }
245 let path = entry.path();
246 if path.is_dir() {
247 let _ = std::fs::remove_dir_all(&path);
248 } else {
249 let _ = std::fs::remove_file(&path);
250 }
251 }
252 }
253 }
254}
255
/// Returns the portion of `path` before its first `/`, or all of `path` when it
/// contains no `/`.
fn top_level_folder(path: &str) -> &str {
    match path.split_once('/') {
        Some((head, _rest)) => head,
        None => path,
    }
}
264
265fn write_version_file(root: &std::path::Path, version: &str) -> Result<(), Error> {
267 let version_file = root.join(".version");
268 let tmp = root.join(".version.tmp");
269 std::fs::write(&tmp, version).map_err(Error::Io)?;
270 std::fs::rename(&tmp, &version_file).map_err(Error::Io)?;
271 Ok(())
272}
273
/// Best-effort removal of stale `.tmp-*` entries directly under `root`.
///
/// An entry is stale when its modification time is more than one hour in the past.
/// Entries whose metadata or modification time cannot be read, or whose modification time
/// lies in the future, are treated as fresh and left alone. Removal failures are ignored,
/// and an unreadable `root` is a no-op.
///
/// Takes `&Path` so any path-like borrow is accepted; existing `&PathBuf` callers keep
/// working through deref coercion.
fn cleanup_old_temps(root: &std::path::Path) {
    /// Entries older than this are considered abandoned.
    const MAX_AGE: Duration = Duration::from_secs(3600);

    let Ok(entries) = std::fs::read_dir(root) else {
        return;
    };

    for entry in entries.flatten() {
        if !entry.file_name().to_string_lossy().starts_with(".tmp-") {
            continue;
        }
        let Ok(meta) = entry.metadata() else {
            continue;
        };
        // An unknown or future mtime yields an age of zero, i.e. "keep".
        let age = meta
            .modified()
            .ok()
            .and_then(|stamp| stamp.elapsed().ok())
            .unwrap_or_default();
        if age <= MAX_AGE {
            continue;
        }

        let path = entry.path();
        let _ = if path.is_dir() {
            std::fs::remove_dir_all(&path)
        } else {
            std::fs::remove_file(&path)
        };
    }
}
305
#[cfg(test)]
mod tests {
    use super::*;

    /// Wipes `<system temp dir>/<dir_name>` and opens a corpus rooted there.
    /// Returns the base directory too so the test can remove it afterwards.
    fn scratch_corpus(dir_name: &str) -> (PathBuf, Corpus) {
        let base = std::env::temp_dir().join(dir_name);
        let _ = std::fs::remove_dir_all(&base);
        let corpus = Corpus::with_cache_root(&base).unwrap();
        (base, corpus)
    }

    #[test]
    fn test_top_level_folder() {
        let cases = [
            ("webp-conformance", "webp-conformance"),
            ("webp-conformance/valid", "webp-conformance"),
            ("clic2025/training/subdir", "clic2025"),
        ];
        for (input, expected) in cases {
            assert_eq!(top_level_folder(input), expected);
        }
    }

    #[test]
    fn test_list_cached_empty() {
        let (base, corpus) = scratch_corpus("codec-corpus-test-list-cached");
        assert!(corpus.list_cached().is_empty());
        let _ = std::fs::remove_dir_all(base);
    }

    #[test]
    fn test_list_cached_with_dirs() {
        let (base, corpus) = scratch_corpus("codec-corpus-test-list-cached2");
        // The dot-prefixed directory must not show up in the listing.
        for dir in ["alpha", "beta", ".tmp-123"] {
            std::fs::create_dir_all(corpus.root.join(dir)).unwrap();
        }
        let cached = corpus.list_cached();
        assert_eq!(cached, vec!["alpha", "beta"]);
        let _ = std::fs::remove_dir_all(base);
    }

    #[test]
    fn test_unknown_dataset_downloads() {
        let (base, corpus) = scratch_corpus("codec-corpus-test-any-name");
        let result = corpus.get("nonexistent-dataset");
        assert!(matches!(result, Err(Error::NetworkUnavailable { .. })));
        let _ = std::fs::remove_dir_all(base);
    }

    #[test]
    fn test_is_cached_empty() {
        let (base, corpus) = scratch_corpus("codec-corpus-test-cached");
        assert!(!corpus.is_cached("webp-conformance"));
        let _ = std::fs::remove_dir_all(base);
    }

    #[test]
    fn test_version_matches() {
        let (base, corpus) = scratch_corpus("codec-corpus-test-version");
        // No marker yet.
        assert!(!corpus.version_matches());

        // Marker for the current crate version.
        write_version_file(&corpus.root, CRATE_VERSION).unwrap();
        assert!(corpus.version_matches());

        // Marker for some other version.
        write_version_file(&corpus.root, "0.0.0-fake").unwrap();
        assert!(!corpus.version_matches());

        let _ = std::fs::remove_dir_all(base);
    }
}