1mod category;
22mod checksum;
23mod discovery;
24pub mod sparse;
25
26use std::path::{Path, PathBuf};
27
28use serde::{Deserialize, Serialize};
29
30pub use category::ImageCategory;
31pub use checksum::compute_checksum;
32pub use sparse::{SparseCheckout, SparseFilter, SparseStatus};
33
34use crate::error::Result;
35
/// A named collection of images located under a common root directory.
///
/// The whole structure is (de)serializable with serde; [`Corpus::load`] and
/// [`Corpus::save`] use JSON as the on-disk representation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Corpus {
    /// Name of the corpus.
    pub name: String,

    /// Directory onto which each image's `relative_path` is joined.
    pub root_path: PathBuf,

    /// The images that make up the corpus.
    pub images: Vec<CorpusImage>,

    /// Descriptive metadata. When the field is missing from the serialized
    /// form, it deserializes to `CorpusMetadata::default()`.
    #[serde(default)]
    pub metadata: CorpusMetadata,
}
52
/// Optional descriptive information attached to a [`Corpus`].
///
/// Every field has an "absent" state, so `CorpusMetadata::default()` is a
/// fully empty record.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CorpusMetadata {
    /// Free-form description of the corpus, if one was provided.
    pub description: Option<String>,

    /// License text or identifier, if one was provided. The code in this
    /// module stores it without interpreting it.
    pub license: Option<String>,

    /// Where the corpus came from, if recorded. Stored as a plain string;
    /// this module does not validate it as a URL.
    pub source_url: Option<String>,

    /// Number of images per category, keyed by the category's string form.
    /// Rebuilt from scratch by `Corpus::update_category_counts`; an absent
    /// field deserializes to an empty map.
    #[serde(default)]
    pub category_counts: std::collections::HashMap<String, usize>,
}
69
/// A single image entry inside a [`Corpus`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CorpusImage {
    /// Location of the image relative to the corpus root directory.
    pub relative_path: PathBuf,

    /// Category assigned to the image, or `None` when uncategorized.
    pub category: Option<ImageCategory>,

    /// Image width; multiplied by `height` to obtain the pixel count.
    pub width: u32,
    /// Image height; multiplied by `width` to obtain the pixel count.
    pub height: u32,

    /// Size of the image file; summed into `CorpusStats::total_bytes`.
    pub file_size: u64,

    /// Content checksum, or `None` if it has not been computed yet.
    /// `Corpus::compute_checksums` fills it in via `compute_checksum`.
    pub checksum: Option<String>,

    /// Image format label (for example `"png"`). `Corpus::filter_format`
    /// compares it case-insensitively.
    pub format: String,
}
92
93impl CorpusImage {
94 #[must_use]
96 pub fn full_path(&self, root: &Path) -> PathBuf {
97 root.join(&self.relative_path)
98 }
99
100 #[must_use]
102 pub fn name(&self) -> &str {
103 self.relative_path
104 .file_name()
105 .and_then(|s| s.to_str())
106 .unwrap_or("")
107 }
108
109 #[must_use]
111 pub fn pixel_count(&self) -> u64 {
112 u64::from(self.width) * u64::from(self.height)
113 }
114}
115
116impl Corpus {
117 #[must_use]
119 pub fn new(name: impl Into<String>, root_path: impl Into<PathBuf>) -> Self {
120 Self {
121 name: name.into(),
122 root_path: root_path.into(),
123 images: Vec::new(),
124 metadata: CorpusMetadata::default(),
125 }
126 }
127
128 pub fn discover(path: impl AsRef<Path>) -> Result<Self> {
133 discovery::discover_corpus(path.as_ref())
134 }
135
136 pub fn load(path: impl AsRef<Path>) -> Result<Self> {
138 let content = std::fs::read_to_string(path.as_ref())?;
139 let corpus: Corpus = serde_json::from_str(&content)?;
140 Ok(corpus)
141 }
142
143 pub fn save(&self, path: impl AsRef<Path>) -> Result<()> {
145 let content = serde_json::to_string_pretty(self)?;
146 std::fs::write(path.as_ref(), content)?;
147 Ok(())
148 }
149
150 #[must_use]
152 pub fn len(&self) -> usize {
153 self.images.len()
154 }
155
156 #[must_use]
158 pub fn is_empty(&self) -> bool {
159 self.images.is_empty()
160 }
161
162 #[must_use]
164 pub fn filter_category(&self, category: ImageCategory) -> Vec<&CorpusImage> {
165 self.images
166 .iter()
167 .filter(|img| img.category == Some(category))
168 .collect()
169 }
170
171 #[must_use]
173 pub fn filter_format(&self, format: &str) -> Vec<&CorpusImage> {
174 let format_lower = format.to_lowercase();
175 self.images
176 .iter()
177 .filter(|img| img.format.to_lowercase() == format_lower)
178 .collect()
179 }
180
181 #[must_use]
183 pub fn filter_min_size(&self, min_width: u32, min_height: u32) -> Vec<&CorpusImage> {
184 self.images
185 .iter()
186 .filter(|img| img.width >= min_width && img.height >= min_height)
187 .collect()
188 }
189
190 #[must_use]
198 pub fn split(&self, train_ratio: f64) -> (Vec<&CorpusImage>, Vec<&CorpusImage>) {
199 let train_ratio = train_ratio.clamp(0.0, 1.0);
200 let mut train = Vec::new();
201 let mut val = Vec::new();
202
203 for (i, img) in self.images.iter().enumerate() {
204 let hash = img.checksum.as_ref().map_or(i, |s| {
206 s.bytes()
207 .fold(0usize, |acc, b| acc.wrapping_add(b as usize))
208 });
209
210 if (hash % 1000) < (train_ratio * 1000.0) as usize {
211 train.push(img);
212 } else {
213 val.push(img);
214 }
215 }
216
217 (train, val)
218 }
219
220 pub fn compute_checksums(&mut self) -> Result<usize> {
222 let mut computed = 0;
223
224 for img in &mut self.images {
225 if img.checksum.is_none() {
226 let path = self.root_path.join(&img.relative_path);
227 if path.exists() {
228 img.checksum = Some(compute_checksum(&path)?);
229 computed += 1;
230 }
231 }
232 }
233
234 Ok(computed)
235 }
236
237 #[must_use]
239 pub fn find_duplicates(&self) -> Vec<Vec<&CorpusImage>> {
240 use std::collections::HashMap;
241
242 let mut by_checksum: HashMap<&str, Vec<&CorpusImage>> = HashMap::new();
243
244 for img in &self.images {
245 if let Some(ref checksum) = img.checksum {
246 by_checksum.entry(checksum).or_default().push(img);
247 }
248 }
249
250 by_checksum.into_values().filter(|v| v.len() > 1).collect()
251 }
252
253 pub fn update_category_counts(&mut self) {
255 self.metadata.category_counts.clear();
256
257 for img in &self.images {
258 if let Some(cat) = img.category {
259 *self
260 .metadata
261 .category_counts
262 .entry(cat.to_string())
263 .or_insert(0) += 1;
264 }
265 }
266 }
267
268 #[must_use]
270 pub fn stats(&self) -> CorpusStats {
271 let total_pixels: u64 = self.images.iter().map(|img| img.pixel_count()).sum();
272 let total_bytes: u64 = self.images.iter().map(|img| img.file_size).sum();
273
274 let widths: Vec<u32> = self.images.iter().map(|img| img.width).collect();
275 let heights: Vec<u32> = self.images.iter().map(|img| img.height).collect();
276
277 CorpusStats {
278 image_count: self.images.len(),
279 total_pixels,
280 total_bytes,
281 min_width: widths.iter().copied().min().unwrap_or(0),
282 max_width: widths.iter().copied().max().unwrap_or(0),
283 min_height: heights.iter().copied().min().unwrap_or(0),
284 max_height: heights.iter().copied().max().unwrap_or(0),
285 }
286 }
287}
288
/// Aggregate figures for a [`Corpus`], as produced by `Corpus::stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CorpusStats {
    /// Number of images in the corpus.
    pub image_count: usize,
    /// Sum of `width * height` over all images.
    pub total_pixels: u64,
    /// Sum of `file_size` over all images.
    pub total_bytes: u64,
    /// Smallest image width, or `0` for an empty corpus.
    pub min_width: u32,
    /// Largest image width, or `0` for an empty corpus.
    pub max_width: u32,
    /// Smallest image height, or `0` for an empty corpus.
    pub min_height: u32,
    /// Largest image height, or `0` for an empty corpus.
    pub max_height: u32,
}
307
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a 100x100 PNG entry at `path` with the given checksum.
    fn sample_image(path: &str, checksum: Option<String>) -> CorpusImage {
        CorpusImage {
            relative_path: PathBuf::from(path),
            category: None,
            width: 100,
            height: 100,
            file_size: 1000,
            checksum,
            format: "png".to_string(),
        }
    }

    /// Builds a corpus of 100 images with distinct 16-digit hex checksums.
    fn sample_corpus() -> Corpus {
        let mut corpus = Corpus::new("test", "/tmp");
        corpus.images.extend(
            (0..100).map(|i| sample_image(&format!("img{i}.png"), Some(format!("{i:016x}")))),
        );
        corpus
    }

    /// Projects a list of images onto their file names for comparison.
    fn names(images: &[&CorpusImage]) -> Vec<String> {
        images.iter().map(|img| img.name().to_string()).collect()
    }

    #[test]
    fn test_corpus_new() {
        let corpus = Corpus::new("test", "/tmp/images");
        assert_eq!(corpus.name, "test");
        assert!(corpus.is_empty());
        assert_eq!(corpus.len(), 0);
    }

    #[test]
    fn test_corpus_image_name() {
        let img = sample_image("subdir/image.png", None);
        assert_eq!(img.name(), "image.png");
    }

    #[test]
    fn test_corpus_split() {
        let corpus = sample_corpus();

        let (train, val) = corpus.split(0.8);
        assert_eq!(train.len() + val.len(), 100);
    }

    #[test]
    fn test_corpus_split_is_deterministic() {
        let corpus = sample_corpus();

        let (train_a, val_a) = corpus.split(0.8);
        let (train_b, val_b) = corpus.split(0.8);
        assert_eq!(names(&train_a), names(&train_b));
        assert_eq!(names(&val_a), names(&val_b));
    }

    #[test]
    fn test_corpus_split_boundary_ratios() {
        let corpus = sample_corpus();

        // Zero, and anything clamped up to zero, sends everything to validation.
        for ratio in [0.0, -1.0] {
            let (train, val) = corpus.split(ratio);
            assert!(train.is_empty());
            assert_eq!(val.len(), 100);
        }

        // One, and anything clamped down to one, sends everything to training.
        for ratio in [1.0, 2.0] {
            let (train, val) = corpus.split(ratio);
            assert_eq!(train.len(), 100);
            assert!(val.is_empty());
        }
    }

    #[test]
    fn test_find_duplicates() {
        let mut corpus = Corpus::new("test", "/tmp");
        corpus.images.push(sample_image("a.png", Some("same".to_string())));
        corpus.images.push(sample_image("b.png", Some("same".to_string())));
        corpus.images.push(sample_image("c.png", Some("other".to_string())));
        corpus.images.push(sample_image("d.png", None));

        let groups = corpus.find_duplicates();
        assert_eq!(groups.len(), 1);
        assert_eq!(names(&groups[0]), vec!["a.png", "b.png"]);
    }

    #[test]
    fn test_stats_empty_corpus() {
        let stats = Corpus::new("test", "/tmp").stats();
        assert_eq!(stats.image_count, 0);
        assert_eq!(stats.total_pixels, 0);
        assert_eq!(stats.total_bytes, 0);
        assert_eq!(stats.min_width, 0);
        assert_eq!(stats.max_width, 0);
        assert_eq!(stats.min_height, 0);
        assert_eq!(stats.max_height, 0);
    }
}