Skip to main content

codec_eval/corpus/
sparse.rs

1//! Sparse checkout utilities for partial corpus downloads.
2//!
3//! This module provides tools for working with git sparse checkout,
4//! allowing you to download only specific files from a corpus repository.
5//!
6//! ## Example
7//!
8//! ```rust,ignore
9//! use codec_eval::corpus::sparse::{SparseCheckout, SparseFilter};
10//!
11//! // Clone with sparse checkout
12//! let sparse = SparseCheckout::clone(
13//!     "https://github.com/example/codec-corpus.git",
14//!     "./corpus",
15//! )?;
16//!
17//! // Add specific paths
18//! sparse.add_paths(&["images/photos/*.png", "images/screenshots/"])?;
19//!
20//! // Or use filters
//! sparse.add_filter(&SparseFilter::Category("photos".to_string()))?;
//! sparse.add_filter(&SparseFilter::Format("png".to_string()))?;
23//!
24//! // Fetch the files
25//! sparse.fetch()?;
26//! ```
27
28use std::path::{Path, PathBuf};
29use std::process::Command;
30
31use crate::error::{Error, Result};
32
/// Handle to a local git repository that is driven through `git sparse-checkout`.
#[derive(Debug)]
pub struct SparseCheckout {
    /// Filesystem location of the repository working tree.
    repo_path: PathBuf,
    /// URL of the remote, when one is known.
    remote_url: Option<String>,
}
41
/// Filter for selecting files in sparse checkout.
///
/// Each variant expands to one or more gitignore-style patterns via
/// [`SparseFilter::to_patterns`].
#[derive(Debug, Clone)]
pub enum SparseFilter {
    /// Match files by glob pattern (passed through unchanged).
    Pattern(String),
    /// Match by directory path; trailing slashes are ignored.
    Directory(String),
    /// Match by file extension/format; a leading `.` is ignored.
    Format(String),
    /// Match by category directory name, at any depth; surrounding slashes are ignored.
    Category(String),
    /// Match files by minimum dimensions (requires manifest).
    MinSize { width: u32, height: u32 },
    /// Match specific file paths (passed through unchanged).
    Paths(Vec<String>),
}

impl SparseFilter {
    /// Convert filter to sparse-checkout patterns.
    ///
    /// `MinSize` cannot be expressed as a path pattern, so it expands to the
    /// catch-all `**/*`; the dimension check has to happen after the fact.
    pub fn to_patterns(&self) -> Vec<String> {
        match self {
            Self::Pattern(pattern) => vec![pattern.clone()],
            Self::Directory(dir) => {
                let dir = dir.trim_end_matches('/');
                vec![format!("{dir}/"), format!("{dir}/**")]
            }
            Self::Format(ext) => {
                let ext = ext.trim_start_matches('.');
                vec![format!("**/*.{ext}")]
            }
            Self::Category(cat) => {
                // Normalize like `Directory` does, so "photos/" does not
                // produce patterns containing a doubled slash.
                let cat = cat.trim_matches('/');
                vec![
                    format!("**/{cat}/"),
                    format!("**/{cat}/**"),
                    format!("{cat}/"),
                    format!("{cat}/**"),
                ]
            }
            // MinSize requires manifest lookup, return all and filter later
            Self::MinSize { .. } => vec!["**/*".to_string()],
            Self::Paths(paths) => paths.clone(),
        }
    }
}
88
89impl SparseCheckout {
90    /// Initialize sparse checkout in an existing repository.
91    pub fn init(repo_path: impl AsRef<Path>) -> Result<Self> {
92        let repo_path = repo_path.as_ref().to_path_buf();
93
94        // Enable sparse checkout
95        run_git(&repo_path, &["sparse-checkout", "init", "--cone"])?;
96
97        Ok(Self {
98            repo_path,
99            remote_url: None,
100        })
101    }
102
103    /// Clone a repository with sparse checkout enabled.
104    pub fn clone(url: &str, target: impl AsRef<Path>) -> Result<Self> {
105        let target = target.as_ref();
106
107        // Create parent directory if needed
108        if let Some(parent) = target.parent() {
109            std::fs::create_dir_all(parent)?;
110        }
111
112        // Clone with sparse checkout and no checkout initially
113        run_git_cwd(
114            target.parent().unwrap_or(Path::new(".")),
115            &[
116                "clone",
117                "--filter=blob:none",
118                "--sparse",
119                "--no-checkout",
120                url,
121                &target.file_name().unwrap().to_string_lossy(),
122            ],
123        )?;
124
125        // Initialize sparse checkout
126        run_git(target, &["sparse-checkout", "init", "--cone"])?;
127
128        Ok(Self {
129            repo_path: target.to_path_buf(),
130            remote_url: Some(url.to_string()),
131        })
132    }
133
134    /// Clone with depth limit for faster initial clone.
135    pub fn clone_shallow(url: &str, target: impl AsRef<Path>, depth: u32) -> Result<Self> {
136        let target = target.as_ref();
137
138        if let Some(parent) = target.parent() {
139            std::fs::create_dir_all(parent)?;
140        }
141
142        run_git_cwd(
143            target.parent().unwrap_or(Path::new(".")),
144            &[
145                "clone",
146                "--filter=blob:none",
147                "--sparse",
148                "--no-checkout",
149                "--depth",
150                &depth.to_string(),
151                url,
152                &target.file_name().unwrap().to_string_lossy(),
153            ],
154        )?;
155
156        run_git(target, &["sparse-checkout", "init", "--cone"])?;
157
158        Ok(Self {
159            repo_path: target.to_path_buf(),
160            remote_url: Some(url.to_string()),
161        })
162    }
163
164    /// Open an existing sparse checkout repository.
165    pub fn open(repo_path: impl AsRef<Path>) -> Result<Self> {
166        let repo_path = repo_path.as_ref().to_path_buf();
167
168        if !repo_path.join(".git").exists() {
169            return Err(Error::Corpus(format!(
170                "Not a git repository: {}",
171                repo_path.display()
172            )));
173        }
174
175        // Get remote URL if available
176        let remote_url = run_git(&repo_path, &["remote", "get-url", "origin"]).ok();
177
178        Ok(Self {
179            repo_path,
180            remote_url,
181        })
182    }
183
184    /// Get the repository path.
185    #[must_use]
186    pub fn path(&self) -> &Path {
187        &self.repo_path
188    }
189
190    /// Get the remote URL.
191    #[must_use]
192    pub fn remote_url(&self) -> Option<&str> {
193        self.remote_url.as_deref()
194    }
195
196    /// Add paths to the sparse checkout.
197    pub fn add_paths(&self, paths: &[&str]) -> Result<()> {
198        let mut args = vec!["sparse-checkout", "add"];
199        args.extend(paths);
200        run_git(&self.repo_path, &args)?;
201        Ok(())
202    }
203
204    /// Set the sparse checkout paths (replaces existing).
205    pub fn set_paths(&self, paths: &[&str]) -> Result<()> {
206        let mut args = vec!["sparse-checkout", "set"];
207        args.extend(paths);
208        run_git(&self.repo_path, &args)?;
209        Ok(())
210    }
211
212    /// Add a filter to the sparse checkout.
213    pub fn add_filter(&self, filter: &SparseFilter) -> Result<()> {
214        let patterns = filter.to_patterns();
215        let refs: Vec<&str> = patterns.iter().map(String::as_str).collect();
216        self.add_paths(&refs)
217    }
218
219    /// Set filters for the sparse checkout (replaces existing).
220    pub fn set_filters(&self, filters: &[SparseFilter]) -> Result<()> {
221        let patterns: Vec<String> = filters.iter().flat_map(|f| f.to_patterns()).collect();
222        let refs: Vec<&str> = patterns.iter().map(String::as_str).collect();
223        self.set_paths(&refs)
224    }
225
226    /// List current sparse checkout patterns.
227    pub fn list_patterns(&self) -> Result<Vec<String>> {
228        let output = run_git(&self.repo_path, &["sparse-checkout", "list"])?;
229        Ok(output.lines().map(String::from).collect())
230    }
231
232    /// Checkout the sparse files.
233    pub fn checkout(&self) -> Result<()> {
234        run_git(&self.repo_path, &["checkout"])?;
235        Ok(())
236    }
237
238    /// Checkout a specific branch or tag.
239    pub fn checkout_ref(&self, reference: &str) -> Result<()> {
240        run_git(&self.repo_path, &["checkout", reference])?;
241        Ok(())
242    }
243
244    /// Fetch updates from remote.
245    pub fn fetch(&self) -> Result<()> {
246        run_git(&self.repo_path, &["fetch", "--filter=blob:none"])?;
247        Ok(())
248    }
249
250    /// Pull updates (fetch + checkout).
251    pub fn pull(&self) -> Result<()> {
252        self.fetch()?;
253        run_git(&self.repo_path, &["pull"])?;
254        Ok(())
255    }
256
257    /// Disable sparse checkout (get all files).
258    pub fn disable(&self) -> Result<()> {
259        run_git(&self.repo_path, &["sparse-checkout", "disable"])?;
260        Ok(())
261    }
262
263    /// Re-enable sparse checkout.
264    pub fn reapply(&self) -> Result<()> {
265        run_git(&self.repo_path, &["sparse-checkout", "reapply"])?;
266        Ok(())
267    }
268
269    /// Get status of sparse checkout.
270    pub fn status(&self) -> Result<SparseStatus> {
271        // Check if sparse checkout is enabled
272        let config =
273            run_git(&self.repo_path, &["config", "core.sparseCheckout"]).unwrap_or_default();
274        let enabled = config.trim() == "true";
275
276        // Get patterns
277        let patterns = if enabled {
278            self.list_patterns().unwrap_or_default()
279        } else {
280            Vec::new()
281        };
282
283        // Count checked out files
284        let files_output = run_git(&self.repo_path, &["ls-files"])?;
285        let checked_out_files = files_output.lines().count();
286
287        // Count total files in repo (if available)
288        let total_files = run_git(&self.repo_path, &["ls-tree", "-r", "--name-only", "HEAD"])
289            .map(|o| o.lines().count())
290            .ok();
291
292        Ok(SparseStatus {
293            enabled,
294            patterns,
295            checked_out_files,
296            total_files,
297        })
298    }
299}
300
/// Snapshot describing the sparse-checkout state of a repository.
#[derive(Debug, Clone)]
pub struct SparseStatus {
    /// `true` when sparse checkout is switched on.
    pub enabled: bool,
    /// Patterns currently configured for sparse checkout.
    pub patterns: Vec<String>,
    /// How many files are currently checked out.
    pub checked_out_files: usize,
    /// How many files the repository contains, when that is known.
    pub total_files: Option<usize>,
}

impl SparseStatus {
    /// Share of the repository's files that are checked out, in percent.
    ///
    /// Returns `None` when the total is unknown. A repository with zero files
    /// is reported as `100.0` to avoid dividing by zero.
    #[must_use]
    pub fn percentage(&self) -> Option<f64> {
        let total = self.total_files?;
        if total == 0 {
            return Some(100.0);
        }
        Some((self.checked_out_files as f64 / total as f64) * 100.0)
    }
}
327
328/// Run a git command in a repository.
329fn run_git(repo_path: &Path, args: &[&str]) -> Result<String> {
330    let output = Command::new("git")
331        .args(["-C", &repo_path.to_string_lossy()])
332        .args(args)
333        .output()
334        .map_err(|e| Error::Corpus(format!("Failed to run git: {e}")))?;
335
336    if !output.status.success() {
337        let stderr = String::from_utf8_lossy(&output.stderr);
338        return Err(Error::Corpus(format!(
339            "git {} failed: {}",
340            args.join(" "),
341            stderr.trim()
342        )));
343    }
344
345    Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
346}
347
348/// Run a git command in a specific directory.
349fn run_git_cwd(cwd: &Path, args: &[&str]) -> Result<String> {
350    let output = Command::new("git")
351        .current_dir(cwd)
352        .args(args)
353        .output()
354        .map_err(|e| Error::Corpus(format!("Failed to run git: {e}")))?;
355
356    if !output.status.success() {
357        let stderr = String::from_utf8_lossy(&output.stderr);
358        return Err(Error::Corpus(format!(
359            "git {} failed: {}",
360            args.join(" "),
361            stderr.trim()
362        )));
363    }
364
365    Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
366}
367
368/// List files that would be matched by patterns (without checking out).
369pub fn preview_patterns(repo_path: &Path, patterns: &[&str]) -> Result<Vec<String>> {
370    // Get all files in repo
371    let all_files = run_git(repo_path, &["ls-tree", "-r", "--name-only", "HEAD"])?;
372
373    let mut matched = Vec::new();
374
375    for file in all_files.lines() {
376        for pattern in patterns {
377            if matches_pattern(file, pattern) {
378                matched.push(file.to_string());
379                break;
380            }
381        }
382    }
383
384    Ok(matched)
385}
386
/// Simple glob pattern matching for repository-relative file paths.
///
/// Implements the subset of gitignore pattern syntax that this module
/// generates and documents:
///
/// - `*` and `?` match within a single path component;
/// - a `**` component matches any number of directories (a trailing `/**`
///   matches everything inside the directory, but not the directory itself);
/// - a trailing `/` restricts the pattern to directories;
/// - a pattern with a `/` at the start or in the middle is anchored at the
///   repository root, otherwise it may match at any depth;
/// - a pattern that matches a directory selects every file beneath it.
///
/// Negation (`!`), character classes (`[...]`) and backslash escapes are not
/// supported; those characters are compared literally.
fn matches_pattern(path: &str, pattern: &str) -> bool {
    if path.is_empty() {
        return false;
    }

    let dir_only = pattern.ends_with('/');
    let body = pattern.trim_end_matches('/');
    // A separator at the start or in the middle anchors the pattern.
    let anchored = body.contains('/');
    let body = body.trim_start_matches('/');
    if body.is_empty() {
        return false;
    }

    let mut pattern_segments: Vec<&str> = Vec::new();
    if !anchored {
        // Unanchored patterns may match at any depth.
        pattern_segments.push("**");
    }
    pattern_segments.extend(body.split('/'));
    let path_segments: Vec<&str> = path.split('/').collect();

    segments_match(&pattern_segments, &path_segments, dir_only)
}

/// Match pattern components against a leading run of path components.
///
/// Succeeds once the pattern is exhausted: whatever is left of the path lies
/// beneath the matched entry. With `dir_only`, something must be left over,
/// because only then is the matched entry known to be a directory.
fn segments_match(pattern: &[&str], path: &[&str], dir_only: bool) -> bool {
    match pattern.split_first() {
        None => !dir_only || !path.is_empty(),
        // Trailing `**`: everything inside, so at least one component must remain.
        Some((&"**", rest)) if rest.is_empty() => !path.is_empty(),
        // Leading/middle `**`: try skipping zero or more directories.
        Some((&"**", rest)) => {
            (0..=path.len()).any(|skip| segments_match(rest, &path[skip..], dir_only))
        }
        Some((head, rest)) => match path.split_first() {
            Some((first, remaining)) => {
                segment_matches(head, first) && segments_match(rest, remaining, dir_only)
            }
            None => false,
        },
    }
}

/// Match one path component against a component pattern with `*` and `?`.
fn segment_matches(pattern: &str, text: &str) -> bool {
    let pat: Vec<char> = pattern.chars().collect();
    let txt: Vec<char> = text.chars().collect();

    let (mut p, mut t) = (0, 0);
    // Position of the most recent `*` and the text index it currently covers up to.
    let mut last_star: Option<(usize, usize)> = None;

    while t < txt.len() {
        if p < pat.len() && pat[p] == '*' {
            last_star = Some((p, t));
            p += 1;
        } else if p < pat.len() && (pat[p] == '?' || pat[p] == txt[t]) {
            p += 1;
            t += 1;
        } else if let Some((star_p, star_t)) = last_star {
            // Mismatch: let the last `*` absorb one more character and retry.
            p = star_p + 1;
            t = star_t + 1;
            last_star = Some((star_p, star_t + 1));
        } else {
            return false;
        }
    }

    // Only trailing `*`s may remain unmatched.
    pat[p..].iter().all(|&c| c == '*')
}
425
#[cfg(test)]
mod tests {
    use super::*;

    /// Each filter variant expands to the expected sparse-checkout patterns.
    #[test]
    fn test_filter_to_patterns() {
        let format_patterns = SparseFilter::Format("png".to_string()).to_patterns();
        assert_eq!(format_patterns, vec!["**/*.png"]);

        let category_patterns = SparseFilter::Category("photos".to_string()).to_patterns();
        for expected in ["**/photos/", "**/photos/**"] {
            assert!(
                category_patterns.iter().any(|p| p == expected),
                "missing category pattern {expected}"
            );
        }

        let directory_patterns = SparseFilter::Directory("images/test".to_string()).to_patterns();
        assert!(directory_patterns.iter().any(|p| p == "images/test/"));
    }

    /// Table of (path, pattern, expected) cases for the glob matcher.
    #[test]
    fn test_matches_pattern() {
        let cases = [
            // Extension matching
            ("images/test.png", "**/*.png", true),
            ("images/test.jpg", "**/*.png", false),
            // Directory matching
            ("photos/image.png", "photos/", true),
            ("photos/sub/image.png", "photos/**", true),
            // Prefix matching
            ("images/photos/test.png", "**/photos/**", true),
        ];

        for (path, pattern, expected) in cases {
            assert_eq!(
                matches_pattern(path, pattern),
                expected,
                "path {path:?} against pattern {pattern:?}"
            );
        }
    }

    /// 50 of 200 files checked out is reported as 25%.
    #[test]
    fn test_sparse_status_percentage() {
        let status = SparseStatus {
            enabled: true,
            patterns: Vec::new(),
            checked_out_files: 50,
            total_files: Some(200),
        };
        let pct = status.percentage().unwrap();
        assert!((pct - 25.0).abs() < 0.01);
    }
}
465            total_files: Some(200),
466        };
467        assert!((status.percentage().unwrap() - 25.0).abs() < 0.01);
468    }
469}