1mod category;
22mod checksum;
23mod discovery;
24pub mod sparse;
25
26use std::path::{Path, PathBuf};
27
28use serde::{Deserialize, Serialize};
29
30pub use category::ImageCategory;
31pub use checksum::compute_checksum;
32pub use sparse::{SparseCheckout, SparseFilter, SparseStatus};
33
34use crate::error::Result;
35
36#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct Corpus {
39 pub name: String,
41
42 pub root_path: PathBuf,
44
45 pub images: Vec<CorpusImage>,
47
48 #[serde(default)]
50 pub metadata: CorpusMetadata,
51}
52
53#[derive(Debug, Clone, Default, Serialize, Deserialize)]
55pub struct CorpusMetadata {
56 pub description: Option<String>,
58
59 pub license: Option<String>,
61
62 pub source_url: Option<String>,
64
65 #[serde(default)]
67 pub category_counts: std::collections::HashMap<String, usize>,
68}
69
70#[derive(Debug, Clone, Serialize, Deserialize)]
72pub struct CorpusImage {
73 pub relative_path: PathBuf,
75
76 pub category: Option<ImageCategory>,
78
79 pub width: u32,
81 pub height: u32,
82
83 pub file_size: u64,
85
86 pub checksum: Option<String>,
88
89 pub format: String,
91}
92
93impl CorpusImage {
94 #[must_use]
96 pub fn full_path(&self, root: &Path) -> PathBuf {
97 root.join(&self.relative_path)
98 }
99
100 #[must_use]
102 pub fn name(&self) -> &str {
103 self.relative_path
104 .file_name()
105 .and_then(|s| s.to_str())
106 .unwrap_or("")
107 }
108
109 #[must_use]
111 pub fn pixel_count(&self) -> u64 {
112 u64::from(self.width) * u64::from(self.height)
113 }
114}
115
116impl Corpus {
117 #[must_use]
119 pub fn new(name: impl Into<String>, root_path: impl Into<PathBuf>) -> Self {
120 Self {
121 name: name.into(),
122 root_path: root_path.into(),
123 images: Vec::new(),
124 metadata: CorpusMetadata::default(),
125 }
126 }
127
128 pub fn discover(path: impl AsRef<Path>) -> Result<Self> {
133 discovery::discover_corpus(path.as_ref())
134 }
135
136 #[cfg(not(feature = "corpus"))]
138 pub const DEFAULT_CORPUS_URL: &'static str =
139 "https://github.com/imazen/codec-corpus.git";
140
141 #[cfg(feature = "corpus")]
158 pub fn get_dataset(dataset: &str) -> Result<Self> {
159 let corpus_api = codec_corpus::Corpus::new()
160 .map_err(|e| crate::Error::Corpus(format!("Failed to initialize corpus: {e}")))?;
161
162 let path = corpus_api
163 .get(dataset)
164 .map_err(|e| crate::Error::Corpus(format!("Failed to get dataset '{dataset}': {e}")))?;
165
166 eprintln!("Using corpus dataset '{}' at {}", dataset, path.display());
167 Self::discover(&path)
168 }
169
170 #[cfg(feature = "corpus")]
180 pub fn discover_or_download(
181 path: impl AsRef<Path>,
182 _url: Option<&str>,
183 _subsets: Option<&[&str]>,
184 ) -> Result<Self> {
185 let path = path.as_ref();
186
187 if path.exists() && path.is_dir() && has_image_files(path) {
189 return Self::discover(path);
190 }
191
192 Err(crate::Error::Corpus(format!(
193 "Path {} not found. Use Corpus::get_dataset() to download datasets automatically.",
194 path.display()
195 )))
196 }
197
198 #[cfg(not(feature = "corpus"))]
203 pub fn discover_or_download(
204 path: impl AsRef<Path>,
205 url: Option<&str>,
206 subsets: Option<&[&str]>,
207 ) -> Result<Self> {
208 let path = path.as_ref();
209 let url = url.unwrap_or(Self::DEFAULT_CORPUS_URL);
210
211 if path.exists() && path.is_dir() && has_image_files(path) {
213 return Self::discover(path);
214 }
215
216 eprintln!(
218 "Corpus not found at {}, downloading from {}",
219 path.display(),
220 url
221 );
222
223 let sparse = if let Some(subsets) = subsets {
225 let checkout = SparseCheckout::clone_shallow(url, path, 1)?;
226 let paths: Vec<&str> = subsets.to_vec();
227 checkout.add_paths(&paths)?;
228 checkout.checkout()?;
229 checkout
230 } else {
231 let checkout = SparseCheckout::clone_shallow(url, path, 1)?;
232 checkout.set_paths(&["*"])?;
233 checkout.checkout()?;
234 checkout
235 };
236
237 eprintln!("Downloaded corpus to {}", sparse.path().display());
238 Self::discover(path)
239 }
240
241 #[cfg(feature = "corpus")]
250 pub fn download_dataset(dataset: &str) -> Result<Self> {
251 Self::get_dataset(dataset)
252 }
253
254 #[cfg(not(feature = "corpus"))]
261 pub fn download_subset(path: impl AsRef<Path>, subset: &str) -> Result<Self> {
262 Self::discover_or_download(path, None, Some(&[subset]))
263 }
264
265 pub fn get_or_download(preferred_path: impl AsRef<Path>) -> Result<Self> {
276 let preferred = preferred_path.as_ref();
277
278 let candidates = [
280 preferred.to_path_buf(),
281 PathBuf::from("./codec-corpus"),
282 PathBuf::from("../codec-corpus"),
283 PathBuf::from("../codec-comparison/codec-corpus"),
284 ];
285
286 for path in &candidates {
287 if path.exists() && has_image_files(path) {
288 eprintln!("Found corpus at {}", path.display());
289 return Self::discover(path);
290 }
291 }
292
293 #[cfg(feature = "corpus")]
295 {
296 Err(crate::Error::Corpus(format!(
297 "Corpus not found at any common location. Use Corpus::get_dataset(\"kodak\") to download automatically."
298 )))
299 }
300
301 #[cfg(not(feature = "corpus"))]
302 {
303 Self::discover_or_download(preferred, None, None)
305 }
306 }
307
308 pub fn load(path: impl AsRef<Path>) -> Result<Self> {
310 let content = std::fs::read_to_string(path.as_ref())?;
311 let corpus: Corpus = serde_json::from_str(&content)?;
312 Ok(corpus)
313 }
314
315 pub fn save(&self, path: impl AsRef<Path>) -> Result<()> {
317 let content = serde_json::to_string_pretty(self)?;
318 std::fs::write(path.as_ref(), content)?;
319 Ok(())
320 }
321
322 #[must_use]
324 pub fn len(&self) -> usize {
325 self.images.len()
326 }
327
328 #[must_use]
330 pub fn is_empty(&self) -> bool {
331 self.images.is_empty()
332 }
333
334 #[must_use]
336 pub fn filter_category(&self, category: ImageCategory) -> Vec<&CorpusImage> {
337 self.images
338 .iter()
339 .filter(|img| img.category == Some(category))
340 .collect()
341 }
342
343 #[must_use]
345 pub fn filter_format(&self, format: &str) -> Vec<&CorpusImage> {
346 let format_lower = format.to_lowercase();
347 self.images
348 .iter()
349 .filter(|img| img.format.to_lowercase() == format_lower)
350 .collect()
351 }
352
353 #[must_use]
355 pub fn filter_min_size(&self, min_width: u32, min_height: u32) -> Vec<&CorpusImage> {
356 self.images
357 .iter()
358 .filter(|img| img.width >= min_width && img.height >= min_height)
359 .collect()
360 }
361
362 #[must_use]
370 pub fn split(&self, train_ratio: f64) -> (Vec<&CorpusImage>, Vec<&CorpusImage>) {
371 let train_ratio = train_ratio.clamp(0.0, 1.0);
372 let mut train = Vec::new();
373 let mut val = Vec::new();
374
375 for (i, img) in self.images.iter().enumerate() {
376 let hash = img.checksum.as_ref().map_or(i, |s| {
378 s.bytes()
379 .fold(0usize, |acc, b| acc.wrapping_add(b as usize))
380 });
381
382 if (hash % 1000) < (train_ratio * 1000.0) as usize {
383 train.push(img);
384 } else {
385 val.push(img);
386 }
387 }
388
389 (train, val)
390 }
391
392 pub fn compute_checksums(&mut self) -> Result<usize> {
394 let mut computed = 0;
395
396 for img in &mut self.images {
397 if img.checksum.is_none() {
398 let path = self.root_path.join(&img.relative_path);
399 if path.exists() {
400 img.checksum = Some(compute_checksum(&path)?);
401 computed += 1;
402 }
403 }
404 }
405
406 Ok(computed)
407 }
408
409 #[must_use]
411 pub fn find_duplicates(&self) -> Vec<Vec<&CorpusImage>> {
412 use std::collections::HashMap;
413
414 let mut by_checksum: HashMap<&str, Vec<&CorpusImage>> = HashMap::new();
415
416 for img in &self.images {
417 if let Some(ref checksum) = img.checksum {
418 by_checksum.entry(checksum).or_default().push(img);
419 }
420 }
421
422 by_checksum.into_values().filter(|v| v.len() > 1).collect()
423 }
424
425 pub fn update_category_counts(&mut self) {
427 self.metadata.category_counts.clear();
428
429 for img in &self.images {
430 if let Some(cat) = img.category {
431 *self
432 .metadata
433 .category_counts
434 .entry(cat.to_string())
435 .or_insert(0) += 1;
436 }
437 }
438 }
439
440 #[must_use]
442 pub fn stats(&self) -> CorpusStats {
443 let total_pixels: u64 = self.images.iter().map(|img| img.pixel_count()).sum();
444 let total_bytes: u64 = self.images.iter().map(|img| img.file_size).sum();
445
446 let widths: Vec<u32> = self.images.iter().map(|img| img.width).collect();
447 let heights: Vec<u32> = self.images.iter().map(|img| img.height).collect();
448
449 CorpusStats {
450 image_count: self.images.len(),
451 total_pixels,
452 total_bytes,
453 min_width: widths.iter().copied().min().unwrap_or(0),
454 max_width: widths.iter().copied().max().unwrap_or(0),
455 min_height: heights.iter().copied().min().unwrap_or(0),
456 max_height: heights.iter().copied().max().unwrap_or(0),
457 }
458 }
459}
460
461#[derive(Debug, Clone, Serialize, Deserialize)]
463pub struct CorpusStats {
464 pub image_count: usize,
466 pub total_pixels: u64,
468 pub total_bytes: u64,
470 pub min_width: u32,
472 pub max_width: u32,
474 pub min_height: u32,
476 pub max_height: u32,
478}
479
/// Reports whether `path` holds at least one image file, either directly or
/// inside one of its immediate subdirectories.
///
/// A file counts as an image when its extension, compared case-insensitively,
/// is one of `png`, `jpg`, `jpeg`, `webp`, `avif` or `jxl`. Directories that
/// cannot be read and entries that fail to load are treated as holding no
/// images; nothing deeper than one subdirectory level is inspected.
fn has_image_files(path: &Path) -> bool {
    const IMAGE_EXTENSIONS: &[&str] = &["png", "jpg", "jpeg", "webp", "avif", "jxl"];

    /// True when `candidate` is a regular file carrying a known image extension.
    fn is_image(candidate: &Path) -> bool {
        candidate.is_file()
            && candidate
                .extension()
                .and_then(|ext| ext.to_str())
                .map_or(false, |ext| {
                    IMAGE_EXTENSIONS.contains(&ext.to_lowercase().as_str())
                })
    }

    /// True when the directory `dir` directly contains an image file.
    fn dir_holds_image(dir: &Path) -> bool {
        std::fs::read_dir(dir).map_or(false, |children| {
            children.flatten().any(|child| is_image(&child.path()))
        })
    }

    std::fs::read_dir(path).map_or(false, |entries| {
        entries.flatten().any(|entry| {
            let entry_path = entry.path();
            if entry_path.is_dir() {
                // Look exactly one level down and no further.
                dir_holds_image(&entry_path)
            } else {
                is_image(&entry_path)
            }
        })
    })
}
512
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a 100x100 `png` entry of 1000 bytes with no category, using the
    /// given path and checksum.
    fn sample_image(relative_path: &str, checksum: Option<String>) -> CorpusImage {
        CorpusImage {
            relative_path: PathBuf::from(relative_path),
            category: None,
            width: 100,
            height: 100,
            file_size: 1000,
            checksum,
            format: "png".to_string(),
        }
    }

    /// A freshly created corpus keeps its name and holds no images.
    #[test]
    fn test_corpus_new() {
        let corpus = Corpus::new("test", "/tmp/images");
        assert_eq!(corpus.name, "test");
        assert!(corpus.is_empty());
    }

    /// `name` returns only the final path component.
    #[test]
    fn test_corpus_image_name() {
        let img = sample_image("subdir/image.png", None);
        assert_eq!(img.name(), "image.png");
    }

    /// Splitting never loses or duplicates an image.
    #[test]
    fn test_corpus_split() {
        let mut corpus = Corpus::new("test", "/tmp");
        corpus.images.extend(
            (0..100).map(|i| sample_image(&format!("img{i}.png"), Some(format!("{i:016x}")))),
        );

        let (train, val) = corpus.split(0.8);
        assert_eq!(train.len() + val.len(), 100);
    }
}