codec_eval/corpus/
mod.rs

1//! Corpus management for test image collections.
2//!
3//! This module provides tools for managing collections of test images,
4//! including discovery, categorization, and checksum-based deduplication.
5//!
6//! ## Example
7//!
8//! ```rust,ignore
//! use codec_eval::corpus::{Corpus, ImageCategory};
10//!
11//! // Discover images in a directory
12//! let corpus = Corpus::discover("./test_images")?;
13//!
14//! // Filter by category
15//! let photos = corpus.filter_category(ImageCategory::Photo);
16//!
17//! // Get training/validation split
18//! let (train, val) = corpus.split(0.8);
19//! ```
20
21mod category;
22mod checksum;
23mod discovery;
24pub mod sparse;
25
26use std::path::{Path, PathBuf};
27
28use serde::{Deserialize, Serialize};
29
30pub use category::ImageCategory;
31pub use checksum::compute_checksum;
32pub use sparse::{SparseCheckout, SparseFilter, SparseStatus};
33
34use crate::error::Result;
35
36/// A corpus of test images.
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct Corpus {
39    /// Name of the corpus.
40    pub name: String,
41
42    /// Root path of the corpus.
43    pub root_path: PathBuf,
44
45    /// Images in the corpus.
46    pub images: Vec<CorpusImage>,
47
48    /// Metadata about the corpus.
49    #[serde(default)]
50    pub metadata: CorpusMetadata,
51}
52
53/// Metadata about a corpus.
54#[derive(Debug, Clone, Default, Serialize, Deserialize)]
55pub struct CorpusMetadata {
56    /// Description of the corpus.
57    pub description: Option<String>,
58
59    /// License information.
60    pub license: Option<String>,
61
62    /// Source URL.
63    pub source_url: Option<String>,
64
65    /// Number of images by category.
66    #[serde(default)]
67    pub category_counts: std::collections::HashMap<String, usize>,
68}
69
70/// An image in the corpus.
71#[derive(Debug, Clone, Serialize, Deserialize)]
72pub struct CorpusImage {
73    /// Relative path from corpus root.
74    pub relative_path: PathBuf,
75
76    /// Image category (if classified).
77    pub category: Option<ImageCategory>,
78
79    /// Image dimensions.
80    pub width: u32,
81    pub height: u32,
82
83    /// File size in bytes.
84    pub file_size: u64,
85
86    /// Content checksum (for deduplication).
87    pub checksum: Option<String>,
88
89    /// Format detected from file extension.
90    pub format: String,
91}
92
93impl CorpusImage {
94    /// Get the full path to the image.
95    #[must_use]
96    pub fn full_path(&self, root: &Path) -> PathBuf {
97        root.join(&self.relative_path)
98    }
99
100    /// Get the image name (filename without path).
101    #[must_use]
102    pub fn name(&self) -> &str {
103        self.relative_path
104            .file_name()
105            .and_then(|s| s.to_str())
106            .unwrap_or("")
107    }
108
109    /// Get pixel count.
110    #[must_use]
111    pub fn pixel_count(&self) -> u64 {
112        u64::from(self.width) * u64::from(self.height)
113    }
114}
115
116impl Corpus {
117    /// Create a new empty corpus.
118    #[must_use]
119    pub fn new(name: impl Into<String>, root_path: impl Into<PathBuf>) -> Self {
120        Self {
121            name: name.into(),
122            root_path: root_path.into(),
123            images: Vec::new(),
124            metadata: CorpusMetadata::default(),
125        }
126    }
127
128    /// Discover images in a directory.
129    ///
130    /// Recursively scans the directory for supported image formats
131    /// (PNG, JPEG, WebP, AVIF).
132    pub fn discover(path: impl AsRef<Path>) -> Result<Self> {
133        discovery::discover_corpus(path.as_ref())
134    }
135
136    /// Load a corpus from a JSON manifest file.
137    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
138        let content = std::fs::read_to_string(path.as_ref())?;
139        let corpus: Corpus = serde_json::from_str(&content)?;
140        Ok(corpus)
141    }
142
143    /// Save the corpus to a JSON manifest file.
144    pub fn save(&self, path: impl AsRef<Path>) -> Result<()> {
145        let content = serde_json::to_string_pretty(self)?;
146        std::fs::write(path.as_ref(), content)?;
147        Ok(())
148    }
149
150    /// Get the number of images in the corpus.
151    #[must_use]
152    pub fn len(&self) -> usize {
153        self.images.len()
154    }
155
156    /// Check if the corpus is empty.
157    #[must_use]
158    pub fn is_empty(&self) -> bool {
159        self.images.is_empty()
160    }
161
162    /// Filter images by category.
163    #[must_use]
164    pub fn filter_category(&self, category: ImageCategory) -> Vec<&CorpusImage> {
165        self.images
166            .iter()
167            .filter(|img| img.category == Some(category))
168            .collect()
169    }
170
171    /// Filter images by format.
172    #[must_use]
173    pub fn filter_format(&self, format: &str) -> Vec<&CorpusImage> {
174        let format_lower = format.to_lowercase();
175        self.images
176            .iter()
177            .filter(|img| img.format.to_lowercase() == format_lower)
178            .collect()
179    }
180
181    /// Filter images by minimum dimensions.
182    #[must_use]
183    pub fn filter_min_size(&self, min_width: u32, min_height: u32) -> Vec<&CorpusImage> {
184        self.images
185            .iter()
186            .filter(|img| img.width >= min_width && img.height >= min_height)
187            .collect()
188    }
189
190    /// Split the corpus into training and validation sets.
191    ///
192    /// Uses a deterministic split based on checksum to ensure reproducibility.
193    ///
194    /// # Arguments
195    ///
196    /// * `train_ratio` - Fraction of images to include in training set (0.0-1.0).
197    #[must_use]
198    pub fn split(&self, train_ratio: f64) -> (Vec<&CorpusImage>, Vec<&CorpusImage>) {
199        let train_ratio = train_ratio.clamp(0.0, 1.0);
200        let mut train = Vec::new();
201        let mut val = Vec::new();
202
203        for (i, img) in self.images.iter().enumerate() {
204            // Use checksum if available, otherwise use index
205            let hash = img.checksum.as_ref().map_or(i, |s| {
206                s.bytes()
207                    .fold(0usize, |acc, b| acc.wrapping_add(b as usize))
208            });
209
210            if (hash % 1000) < (train_ratio * 1000.0) as usize {
211                train.push(img);
212            } else {
213                val.push(img);
214            }
215        }
216
217        (train, val)
218    }
219
220    /// Compute checksums for all images that don't have them.
221    pub fn compute_checksums(&mut self) -> Result<usize> {
222        let mut computed = 0;
223
224        for img in &mut self.images {
225            if img.checksum.is_none() {
226                let path = self.root_path.join(&img.relative_path);
227                if path.exists() {
228                    img.checksum = Some(compute_checksum(&path)?);
229                    computed += 1;
230                }
231            }
232        }
233
234        Ok(computed)
235    }
236
237    /// Find duplicate images by checksum.
238    #[must_use]
239    pub fn find_duplicates(&self) -> Vec<Vec<&CorpusImage>> {
240        use std::collections::HashMap;
241
242        let mut by_checksum: HashMap<&str, Vec<&CorpusImage>> = HashMap::new();
243
244        for img in &self.images {
245            if let Some(ref checksum) = img.checksum {
246                by_checksum.entry(checksum).or_default().push(img);
247            }
248        }
249
250        by_checksum.into_values().filter(|v| v.len() > 1).collect()
251    }
252
253    /// Update category counts in metadata.
254    pub fn update_category_counts(&mut self) {
255        self.metadata.category_counts.clear();
256
257        for img in &self.images {
258            if let Some(cat) = img.category {
259                *self
260                    .metadata
261                    .category_counts
262                    .entry(cat.to_string())
263                    .or_insert(0) += 1;
264            }
265        }
266    }
267
268    /// Get statistics about the corpus.
269    #[must_use]
270    pub fn stats(&self) -> CorpusStats {
271        let total_pixels: u64 = self.images.iter().map(|img| img.pixel_count()).sum();
272        let total_bytes: u64 = self.images.iter().map(|img| img.file_size).sum();
273
274        let widths: Vec<u32> = self.images.iter().map(|img| img.width).collect();
275        let heights: Vec<u32> = self.images.iter().map(|img| img.height).collect();
276
277        CorpusStats {
278            image_count: self.images.len(),
279            total_pixels,
280            total_bytes,
281            min_width: widths.iter().copied().min().unwrap_or(0),
282            max_width: widths.iter().copied().max().unwrap_or(0),
283            min_height: heights.iter().copied().min().unwrap_or(0),
284            max_height: heights.iter().copied().max().unwrap_or(0),
285        }
286    }
287}
288
289/// Statistics about a corpus.
290#[derive(Debug, Clone, Serialize, Deserialize)]
291pub struct CorpusStats {
292    /// Number of images.
293    pub image_count: usize,
294    /// Total pixels across all images.
295    pub total_pixels: u64,
296    /// Total file size in bytes.
297    pub total_bytes: u64,
298    /// Minimum image width.
299    pub min_width: u32,
300    /// Maximum image width.
301    pub max_width: u32,
302    /// Minimum image height.
303    pub min_height: u32,
304    /// Maximum image height.
305    pub max_height: u32,
306}
307
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an uncategorized 100x100 PNG entry of 1000 bytes.
    fn sample_image(relative_path: &str, checksum: Option<String>) -> CorpusImage {
        CorpusImage {
            relative_path: PathBuf::from(relative_path),
            category: None,
            width: 100,
            height: 100,
            file_size: 1000,
            checksum,
            format: "png".to_string(),
        }
    }

    /// Collect image names so two splits can be compared by content.
    fn names(images: &[&CorpusImage]) -> Vec<String> {
        images.iter().map(|img| img.name().to_string()).collect()
    }

    #[test]
    fn test_corpus_new() {
        let corpus = Corpus::new("test", "/tmp/images");
        assert_eq!(corpus.name, "test");
        assert!(corpus.is_empty());
    }

    #[test]
    fn test_corpus_image_name() {
        let img = sample_image("subdir/image.png", None);
        assert_eq!(img.name(), "image.png");
    }

    #[test]
    fn test_corpus_split() {
        let mut corpus = Corpus::new("test", "/tmp");
        for i in 0..100 {
            // Use varied checksums so images spread across hash buckets.
            corpus.images.push(sample_image(
                &format!("img{i}.png"),
                Some(format!("{i:016x}")),
            ));
        }

        // Every image lands in exactly one of the two sets.
        let (train, val) = corpus.split(0.8);
        assert_eq!(train.len() + val.len(), 100);

        // Splitting again with the same ratio gives the same assignment.
        let (train_again, val_again) = corpus.split(0.8);
        assert_eq!(names(&train), names(&train_again));
        assert_eq!(names(&val), names(&val_again));

        // Boundary ratios send everything to one side.
        let (train, val) = corpus.split(0.0);
        assert!(train.is_empty());
        assert_eq!(val.len(), 100);

        let (train, val) = corpus.split(1.0);
        assert_eq!(train.len(), 100);
        assert!(val.is_empty());
    }
}