Skip to main content

codec_eval/corpus/
mod.rs

1//! Corpus management for test image collections.
2//!
3//! This module provides tools for managing collections of test images,
4//! including discovery, categorization, and checksum-based deduplication.
5//!
6//! ## Example
7//!
8//! ```rust,ignore
//! use codec_eval::corpus::{Corpus, ImageCategory};
10//!
11//! // Discover images in a directory
12//! let corpus = Corpus::discover("./test_images")?;
13//!
14//! // Filter by category
15//! let photos = corpus.filter_category(ImageCategory::Photo);
16//!
17//! // Get training/validation split
18//! let (train, val) = corpus.split(0.8);
19//! ```
20
21mod category;
22mod checksum;
23mod discovery;
24pub mod sparse;
25
26use std::path::{Path, PathBuf};
27
28use serde::{Deserialize, Serialize};
29
30pub use category::ImageCategory;
31pub use checksum::compute_checksum;
32pub use sparse::{SparseCheckout, SparseFilter, SparseStatus};
33
34use crate::error::Result;
35
36/// A corpus of test images.
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct Corpus {
39    /// Name of the corpus.
40    pub name: String,
41
42    /// Root path of the corpus.
43    pub root_path: PathBuf,
44
45    /// Images in the corpus.
46    pub images: Vec<CorpusImage>,
47
48    /// Metadata about the corpus.
49    #[serde(default)]
50    pub metadata: CorpusMetadata,
51}
52
53/// Metadata about a corpus.
54#[derive(Debug, Clone, Default, Serialize, Deserialize)]
55pub struct CorpusMetadata {
56    /// Description of the corpus.
57    pub description: Option<String>,
58
59    /// License information.
60    pub license: Option<String>,
61
62    /// Source URL.
63    pub source_url: Option<String>,
64
65    /// Number of images by category.
66    #[serde(default)]
67    pub category_counts: std::collections::HashMap<String, usize>,
68}
69
70/// An image in the corpus.
71#[derive(Debug, Clone, Serialize, Deserialize)]
72pub struct CorpusImage {
73    /// Relative path from corpus root.
74    pub relative_path: PathBuf,
75
76    /// Image category (if classified).
77    pub category: Option<ImageCategory>,
78
79    /// Image dimensions.
80    pub width: u32,
81    pub height: u32,
82
83    /// File size in bytes.
84    pub file_size: u64,
85
86    /// Content checksum (for deduplication).
87    pub checksum: Option<String>,
88
89    /// Format detected from file extension.
90    pub format: String,
91}
92
93impl CorpusImage {
94    /// Get the full path to the image.
95    #[must_use]
96    pub fn full_path(&self, root: &Path) -> PathBuf {
97        root.join(&self.relative_path)
98    }
99
100    /// Get the image name (filename without path).
101    #[must_use]
102    pub fn name(&self) -> &str {
103        self.relative_path
104            .file_name()
105            .and_then(|s| s.to_str())
106            .unwrap_or("")
107    }
108
109    /// Get pixel count.
110    #[must_use]
111    pub fn pixel_count(&self) -> u64 {
112        u64::from(self.width) * u64::from(self.height)
113    }
114}
115
116impl Corpus {
117    /// Create a new empty corpus.
118    #[must_use]
119    pub fn new(name: impl Into<String>, root_path: impl Into<PathBuf>) -> Self {
120        Self {
121            name: name.into(),
122            root_path: root_path.into(),
123            images: Vec::new(),
124            metadata: CorpusMetadata::default(),
125        }
126    }
127
128    /// Discover images in a directory.
129    ///
130    /// Recursively scans the directory for supported image formats
131    /// (PNG, JPEG, WebP, AVIF).
132    pub fn discover(path: impl AsRef<Path>) -> Result<Self> {
133        discovery::discover_corpus(path.as_ref())
134    }
135
    /// Default corpus repository URL (used when corpus feature is disabled).
    ///
    /// `discover_or_download` clones from this URL when the caller passes
    /// `None` for `url`.
    #[cfg(not(feature = "corpus"))]
    pub const DEFAULT_CORPUS_URL: &'static str =
        "https://github.com/imazen/codec-corpus.git";
140
141    /// Get corpus dataset, downloading if necessary.
142    ///
143    /// When the `corpus` feature is enabled (default), uses the codec-corpus crate
144    /// for automatic download and caching. Otherwise, checks if the path exists locally.
145    ///
146    /// # Arguments
147    /// * `path` - Dataset path (e.g., "kodak", "clic2025/training")
148    ///
149    /// # Example
150    /// ```rust,ignore
151    /// // With corpus feature (default): downloads and caches automatically
152    /// let corpus = Corpus::get_dataset("kodak")?;
153    ///
154    /// // Discovers images in the cached directory
155    /// println!("Found {} images", corpus.len());
156    /// ```
157    #[cfg(feature = "corpus")]
158    pub fn get_dataset(dataset: &str) -> Result<Self> {
159        let corpus_api = codec_corpus::Corpus::new()
160            .map_err(|e| crate::Error::Corpus(format!("Failed to initialize corpus: {e}")))?;
161
162        let path = corpus_api
163            .get(dataset)
164            .map_err(|e| crate::Error::Corpus(format!("Failed to get dataset '{dataset}': {e}")))?;
165
166        eprintln!("Using corpus dataset '{}' at {}", dataset, path.display());
167        Self::discover(&path)
168    }
169
170    /// Discover or download a corpus on demand (legacy, requires local path).
171    ///
172    /// If the path exists, discovers images. Otherwise returns an error.
173    /// Use `get_dataset()` when the `corpus` feature is enabled for automatic downloads.
174    ///
175    /// # Arguments
176    /// * `path` - Local path for the corpus
177    /// * `_url` - Ignored (for backward compatibility)
178    /// * `_subsets` - Ignored (for backward compatibility)
179    #[cfg(feature = "corpus")]
180    pub fn discover_or_download(
181        path: impl AsRef<Path>,
182        _url: Option<&str>,
183        _subsets: Option<&[&str]>,
184    ) -> Result<Self> {
185        let path = path.as_ref();
186
187        // If path exists and has images, discover
188        if path.exists() && path.is_dir() && has_image_files(path) {
189            return Self::discover(path);
190        }
191
192        Err(crate::Error::Corpus(format!(
193            "Path {} not found. Use Corpus::get_dataset() to download datasets automatically.",
194            path.display()
195        )))
196    }
197
198    /// Discover or download a corpus on demand (sparse checkout fallback).
199    ///
200    /// This version uses git sparse checkout when the `corpus` feature is disabled.
201    /// For most users, the `corpus` feature (enabled by default) is recommended.
202    #[cfg(not(feature = "corpus"))]
203    pub fn discover_or_download(
204        path: impl AsRef<Path>,
205        url: Option<&str>,
206        subsets: Option<&[&str]>,
207    ) -> Result<Self> {
208        let path = path.as_ref();
209        let url = url.unwrap_or(Self::DEFAULT_CORPUS_URL);
210
211        // If path exists and has images, just discover
212        if path.exists() && path.is_dir() && has_image_files(path) {
213            return Self::discover(path);
214        }
215
216        // Need to download
217        eprintln!(
218            "Corpus not found at {}, downloading from {}",
219            path.display(),
220            url
221        );
222
223        // Use sparse checkout for efficiency
224        let sparse = if let Some(subsets) = subsets {
225            let checkout = SparseCheckout::clone_shallow(url, path, 1)?;
226            let paths: Vec<&str> = subsets.to_vec();
227            checkout.add_paths(&paths)?;
228            checkout.checkout()?;
229            checkout
230        } else {
231            let checkout = SparseCheckout::clone_shallow(url, path, 1)?;
232            checkout.set_paths(&["*"])?;
233            checkout.checkout()?;
234            checkout
235        };
236
237        eprintln!("Downloaded corpus to {}", sparse.path().display());
238        Self::discover(path)
239    }
240
    /// Download a specific dataset (replaces download_subset).
    ///
    /// Only compiled with the `corpus` feature. This is a thin alias: the call
    /// is forwarded unchanged to `get_dataset`, so arguments, result and
    /// errors are exactly those of `get_dataset`.
    ///
    /// # Example
    /// ```rust,ignore
    /// let corpus = Corpus::download_dataset("kodak")?;
    /// ```
    #[cfg(feature = "corpus")]
    pub fn download_dataset(dataset: &str) -> Result<Self> {
        Self::get_dataset(dataset)
    }
253
254    /// Download a specific subset of the corpus (sparse checkout fallback).
255    ///
256    /// # Example
257    /// ```rust,ignore
258    /// let corpus = Corpus::download_subset("./corpus", "kodak")?;
259    /// ```
260    #[cfg(not(feature = "corpus"))]
261    pub fn download_subset(path: impl AsRef<Path>, subset: &str) -> Result<Self> {
262        Self::discover_or_download(path, None, Some(&[subset]))
263    }
264
265    /// Get corpus from local paths (legacy method).
266    ///
267    /// Checks common locations for existing corpus:
268    /// 1. The specified path
269    /// 2. ./codec-corpus
270    /// 3. ../codec-corpus
271    /// 4. ../codec-comparison/codec-corpus
272    ///
273    /// When the `corpus` feature is enabled, use `get_dataset()` instead
274    /// for automatic download and caching.
275    pub fn get_or_download(preferred_path: impl AsRef<Path>) -> Result<Self> {
276        let preferred = preferred_path.as_ref();
277
278        // Check common locations
279        let candidates = [
280            preferred.to_path_buf(),
281            PathBuf::from("./codec-corpus"),
282            PathBuf::from("../codec-corpus"),
283            PathBuf::from("../codec-comparison/codec-corpus"),
284        ];
285
286        for path in &candidates {
287            if path.exists() && has_image_files(path) {
288                eprintln!("Found corpus at {}", path.display());
289                return Self::discover(path);
290            }
291        }
292
293        // Not found
294        #[cfg(feature = "corpus")]
295        {
296            Err(crate::Error::Corpus(format!(
297                "Corpus not found at any common location. Use Corpus::get_dataset(\"kodak\") to download automatically."
298            )))
299        }
300
301        #[cfg(not(feature = "corpus"))]
302        {
303            // Fallback to sparse checkout
304            Self::discover_or_download(preferred, None, None)
305        }
306    }
307
308    /// Load a corpus from a JSON manifest file.
309    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
310        let content = std::fs::read_to_string(path.as_ref())?;
311        let corpus: Corpus = serde_json::from_str(&content)?;
312        Ok(corpus)
313    }
314
315    /// Save the corpus to a JSON manifest file.
316    pub fn save(&self, path: impl AsRef<Path>) -> Result<()> {
317        let content = serde_json::to_string_pretty(self)?;
318        std::fs::write(path.as_ref(), content)?;
319        Ok(())
320    }
321
    /// Get the number of images in the corpus.
    ///
    /// This is the length of the `images` vector.
    #[must_use]
    pub fn len(&self) -> usize {
        self.images.len()
    }
327
    /// Check if the corpus is empty.
    ///
    /// Returns `true` when the `images` vector holds no entries.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.images.is_empty()
    }
333
334    /// Filter images by category.
335    #[must_use]
336    pub fn filter_category(&self, category: ImageCategory) -> Vec<&CorpusImage> {
337        self.images
338            .iter()
339            .filter(|img| img.category == Some(category))
340            .collect()
341    }
342
343    /// Filter images by format.
344    #[must_use]
345    pub fn filter_format(&self, format: &str) -> Vec<&CorpusImage> {
346        let format_lower = format.to_lowercase();
347        self.images
348            .iter()
349            .filter(|img| img.format.to_lowercase() == format_lower)
350            .collect()
351    }
352
353    /// Filter images by minimum dimensions.
354    #[must_use]
355    pub fn filter_min_size(&self, min_width: u32, min_height: u32) -> Vec<&CorpusImage> {
356        self.images
357            .iter()
358            .filter(|img| img.width >= min_width && img.height >= min_height)
359            .collect()
360    }
361
362    /// Split the corpus into training and validation sets.
363    ///
364    /// Uses a deterministic split based on checksum to ensure reproducibility.
365    ///
366    /// # Arguments
367    ///
368    /// * `train_ratio` - Fraction of images to include in training set (0.0-1.0).
369    #[must_use]
370    pub fn split(&self, train_ratio: f64) -> (Vec<&CorpusImage>, Vec<&CorpusImage>) {
371        let train_ratio = train_ratio.clamp(0.0, 1.0);
372        let mut train = Vec::new();
373        let mut val = Vec::new();
374
375        for (i, img) in self.images.iter().enumerate() {
376            // Use checksum if available, otherwise use index
377            let hash = img.checksum.as_ref().map_or(i, |s| {
378                s.bytes()
379                    .fold(0usize, |acc, b| acc.wrapping_add(b as usize))
380            });
381
382            if (hash % 1000) < (train_ratio * 1000.0) as usize {
383                train.push(img);
384            } else {
385                val.push(img);
386            }
387        }
388
389        (train, val)
390    }
391
392    /// Compute checksums for all images that don't have them.
393    pub fn compute_checksums(&mut self) -> Result<usize> {
394        let mut computed = 0;
395
396        for img in &mut self.images {
397            if img.checksum.is_none() {
398                let path = self.root_path.join(&img.relative_path);
399                if path.exists() {
400                    img.checksum = Some(compute_checksum(&path)?);
401                    computed += 1;
402                }
403            }
404        }
405
406        Ok(computed)
407    }
408
409    /// Find duplicate images by checksum.
410    #[must_use]
411    pub fn find_duplicates(&self) -> Vec<Vec<&CorpusImage>> {
412        use std::collections::HashMap;
413
414        let mut by_checksum: HashMap<&str, Vec<&CorpusImage>> = HashMap::new();
415
416        for img in &self.images {
417            if let Some(ref checksum) = img.checksum {
418                by_checksum.entry(checksum).or_default().push(img);
419            }
420        }
421
422        by_checksum.into_values().filter(|v| v.len() > 1).collect()
423    }
424
425    /// Update category counts in metadata.
426    pub fn update_category_counts(&mut self) {
427        self.metadata.category_counts.clear();
428
429        for img in &self.images {
430            if let Some(cat) = img.category {
431                *self
432                    .metadata
433                    .category_counts
434                    .entry(cat.to_string())
435                    .or_insert(0) += 1;
436            }
437        }
438    }
439
440    /// Get statistics about the corpus.
441    #[must_use]
442    pub fn stats(&self) -> CorpusStats {
443        let total_pixels: u64 = self.images.iter().map(|img| img.pixel_count()).sum();
444        let total_bytes: u64 = self.images.iter().map(|img| img.file_size).sum();
445
446        let widths: Vec<u32> = self.images.iter().map(|img| img.width).collect();
447        let heights: Vec<u32> = self.images.iter().map(|img| img.height).collect();
448
449        CorpusStats {
450            image_count: self.images.len(),
451            total_pixels,
452            total_bytes,
453            min_width: widths.iter().copied().min().unwrap_or(0),
454            max_width: widths.iter().copied().max().unwrap_or(0),
455            min_height: heights.iter().copied().min().unwrap_or(0),
456            max_height: heights.iter().copied().max().unwrap_or(0),
457        }
458    }
459}
460
461/// Statistics about a corpus.
462#[derive(Debug, Clone, Serialize, Deserialize)]
463pub struct CorpusStats {
464    /// Number of images.
465    pub image_count: usize,
466    /// Total pixels across all images.
467    pub total_pixels: u64,
468    /// Total file size in bytes.
469    pub total_bytes: u64,
470    /// Minimum image width.
471    pub min_width: u32,
472    /// Maximum image width.
473    pub max_width: u32,
474    /// Minimum image height.
475    pub min_height: u32,
476    /// Maximum image height.
477    pub max_height: u32,
478}
479
/// Report whether `path` holds at least one image file.
///
/// Files directly inside `path` and files inside its immediate
/// subdirectories are considered; anything nested deeper is not looked at.
/// A file counts as an image when its extension, lowercased, is one of
/// `png`, `jpg`, `jpeg`, `webp`, `avif` or `jxl`.
///
/// Returns `false` when `path` cannot be read as a directory. Entries and
/// subdirectories that fail to read are skipped.
fn has_image_files(path: &Path) -> bool {
    const IMAGE_EXTENSIONS: &[&str] = &["png", "jpg", "jpeg", "webp", "avif", "jxl"];

    // True when the extension of `candidate` is a known image extension.
    fn has_image_extension(candidate: &Path) -> bool {
        match candidate.extension().and_then(|ext| ext.to_str()) {
            Some(ext) => IMAGE_EXTENSIONS.contains(&ext.to_lowercase().as_str()),
            None => false,
        }
    }

    let entries = match std::fs::read_dir(path) {
        Ok(entries) => entries,
        Err(_) => return false,
    };

    entries.flatten().any(|entry| {
        let entry_path = entry.path();
        if entry_path.is_file() {
            has_image_extension(&entry_path)
        } else if entry_path.is_dir() {
            // Look one level down only, to keep the probe cheap.
            match std::fs::read_dir(&entry_path) {
                Ok(sub_entries) => sub_entries.flatten().any(|sub_entry| {
                    let sub_path = sub_entry.path();
                    sub_path.is_file() && has_image_extension(&sub_path)
                }),
                Err(_) => false,
            }
        } else {
            false
        }
    })
}
512
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an unclassified 100x100, 1000-byte PNG entry.
    fn sample_image(relative_path: &str, checksum: Option<String>) -> CorpusImage {
        CorpusImage {
            relative_path: PathBuf::from(relative_path),
            category: None,
            width: 100,
            height: 100,
            file_size: 1000,
            checksum,
            format: "png".to_string(),
        }
    }

    #[test]
    fn test_corpus_new() {
        let corpus = Corpus::new("test", "/tmp/images");
        assert_eq!(corpus.name, "test");
        assert!(corpus.is_empty());
    }

    #[test]
    fn test_corpus_image_name() {
        let img = sample_image("subdir/image.png", None);
        assert_eq!(img.name(), "image.png");
    }

    #[test]
    fn test_corpus_split() {
        let mut corpus = Corpus::new("test", "/tmp");
        // Give every image its own checksum so the split has varied input.
        corpus.images.extend(
            (0..100).map(|i| sample_image(&format!("img{i}.png"), Some(format!("{i:016x}")))),
        );

        let (train, val) = corpus.split(0.8);
        // The two sets together must account for all 100 images.
        assert_eq!(train.len() + val.len(), 100);
    }
}