Skip to main content

context_builder/
file_utils.rs

1use ignore::{DirEntry, WalkBuilder, overrides::OverrideBuilder};
2use std::fs;
3use std::io::{self, Write};
4use std::path::{Path, PathBuf};
5
/// Returns a numeric category for file relevance ordering.
/// Lower numbers appear first in output. Categories:
/// 0 = Project config + key docs (Cargo.toml, README.md, AGENTS.md, etc.)
/// 1 = Source code (src/, lib/) — entry points sorted first within category
/// 2 = Tests and benchmarks (tests/, benches/, test/, spec/)
/// 3 = Documentation, scripts, and everything else
/// 4 = Build/CI infrastructure (.github/, .circleci/, Dockerfile, etc.)
/// 5 = Generated/lock files (Cargo.lock, package-lock.json, etc.)
///
/// `path` is classified relative to `base_path`. When `path` is not located
/// under `base_path`, it is classified exactly as given.
fn file_relevance_category(path: &Path, base_path: &Path) -> u8 {
    // Lockfiles are matched by file name anywhere in the tree.
    const LOCKFILE_NAMES: &[&str] = &[
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Gemfile.lock",
        "poetry.lock",
        "composer.lock",
        "go.sum",
        "bun.lockb",
        "flake.lock",
    ];
    // Config/manifest files + key project docs, also matched by file name.
    const CONFIG_NAMES: &[&str] = &[
        // Package manifests
        "Cargo.toml",
        "package.json",
        "tsconfig.json",
        "pyproject.toml",
        "setup.py",
        "setup.cfg",
        "go.mod",
        "Gemfile",
        // Tool config
        "context-builder.toml",
        ".gitignore",
        // Key project documentation (LLMs need these for context)
        "README.md",
        "README",
        "README.txt",
        "README.rst",
        "AGENTS.md",
        "CLAUDE.md",
        "GEMINI.md",
        "COPILOT.md",
        "CONTRIBUTING.md",
        "CHANGELOG.md",
    ];
    // Build-related files recognised by exact name outside the known directories.
    const BUILD_FILE_NAMES: &[&str] = &[
        "Makefile",
        "CMakeLists.txt",
        "Dockerfile",
        "Containerfile",
        "Justfile",
        "Taskfile",
        "Rakefile",
        "Vagrantfile",
    ];
    // File-name suffixes that mark a source file as a test.
    const TEST_FILE_SUFFIXES: &[&str] = &[
        "_test.rs",
        "_test.go",
        "_test.py",
        "_spec.rb",
        ".test.ts",
        ".test.js",
        ".spec.ts",
        ".spec.js",
    ];

    let relative = path.strip_prefix(base_path).unwrap_or(path);
    let rel_str = relative.to_string_lossy();
    let file_name = relative.file_name().and_then(|n| n.to_str());

    if let Some(name) = file_name {
        // Lockfiles first — these are lowest priority.
        if LOCKFILE_NAMES.contains(&name) {
            return 5;
        }
        // Config/manifest files + key project docs — highest priority.
        if CONFIG_NAMES.contains(&name) {
            return 0;
        }
    }

    // Check path prefix for category
    let first_component = relative
        .components()
        .next()
        .and_then(|c| c.as_os_str().to_str())
        .unwrap_or("");

    match first_component {
        "src" | "lib" | "crates" | "packages" | "internal" | "cmd" | "pkg" => {
            // Test directories nested inside source trees count as tests,
            // e.g. src/tests/auth.rs is category 2, not category 1.
            if is_under_test_dir(relative) {
                2
            } else {
                1
            }
        }
        "tests" | "test" | "spec" | "benches" | "benchmarks" | "__tests__" => 2,
        "docs" | "doc" | "examples" | "scripts" | "tools" | "assets" => 3,
        // Build/CI infrastructure — useful context but not core source
        ".github" | ".circleci" | ".gitlab" | ".buildkite" => 4,
        _ => {
            // Build files are matched by name before looking at the extension;
            // otherwise "CMakeLists.txt" would be treated as a plain ".txt" doc.
            if file_name.is_some_and(|name| BUILD_FILE_NAMES.contains(&name)) {
                return 4;
            }

            match relative.extension().and_then(|e| e.to_str()) {
                Some(
                    "rs" | "go" | "py" | "ts" | "js" | "java" | "c" | "cpp" | "h" | "hpp" | "rb"
                    | "swift" | "kt" | "scala" | "ex" | "exs" | "zig" | "hs",
                ) => {
                    // Source file not in a recognized dir — check if it's a test.
                    // Whole directory names and file-name prefixes/suffixes are
                    // used to avoid false positives (e.g., "contest.rs").
                    let looks_like_test = is_under_test_dir(relative)
                        || TEST_FILE_SUFFIXES
                            .iter()
                            .any(|suffix| rel_str.ends_with(*suffix))
                        // Leading path segment starts with "test_" (original rule).
                        || rel_str.starts_with("test_")
                        // File itself is named test_*.ext, at any depth.
                        || file_name.is_some_and(|name| name.starts_with("test_"));
                    if looks_like_test {
                        2
                    } else {
                        1
                    }
                }
                Some("md" | "txt" | "rst" | "adoc") => 3,
                Some(_) => 1, // Unknown extension outside known dirs — treat as source
                None => 3,    // No extension — docs/other
            }
        }
    }
}

/// Returns true when any directory between the first path component and the
/// file name is a test or benchmark directory. Works on path components, so
/// it does not depend on the platform's path separator.
fn is_under_test_dir(relative: &Path) -> bool {
    const TEST_DIR_NAMES: &[&str] = &[
        "tests",
        "test",
        "spec",
        "__tests__",
        "benches",
        "benchmarks",
    ];
    relative
        .parent()
        .into_iter()
        .flat_map(|dir| dir.components())
        // The first component is handled by the caller's top-level match.
        .skip(1)
        .filter_map(|component| component.as_os_str().to_str())
        .any(|dir_name| TEST_DIR_NAMES.contains(&dir_name))
}
138
/// Ranks a file within its relevance category; lower values sort first.
///
/// Files whose stem (file name without the final extension) is a conventional
/// entry-point name get 0, everything else gets 1, so architectural entry
/// points are listed before helper modules.
fn file_entry_point_priority(path: &Path) -> u8 {
    const ENTRY_POINT_STEMS: [&str; 6] = ["main", "lib", "mod", "index", "app", "__init__"];

    match path.file_stem().and_then(|stem| stem.to_str()) {
        Some(stem) if ENTRY_POINT_STEMS.contains(&stem) => 0,
        _ => 1,
    }
}
152
153/// Collects all files to be processed using `ignore` crate for efficient traversal.
154///
155/// `auto_ignores` are runtime-computed exclusion patterns (e.g., the tool's own
156/// output file or cache directory). They are processed identically to user ignores
157/// but kept separate to avoid polluting user-facing configuration.
158pub fn collect_files(
159    base_path: &Path,
160    filters: &[String],
161    ignores: &[String],
162    auto_ignores: &[String],
163) -> io::Result<Vec<DirEntry>> {
164    let mut walker = WalkBuilder::new(base_path);
165    // By default, the "ignore" crate respects .gitignore and hidden files, so we don't need walker.hidden(false)
166
167    // Build overrides for custom ignore patterns
168    let mut override_builder = OverrideBuilder::new(base_path);
169    for pattern in ignores {
170        // Attention: Confusing pattern ahead!
171        // Add the pattern to the override builder with ! prefix to ignore matching files.
172        // In OverrideBuilder, patterns without ! are whitelist (include) patterns,
173        // while patterns with ! are ignore patterns.
174        let ignore_pattern = format!("!{}", pattern);
175        if let Err(e) = override_builder.add(&ignore_pattern) {
176            return Err(io::Error::new(
177                io::ErrorKind::InvalidInput,
178                format!("Invalid ignore pattern '{}': {}", pattern, e),
179            ));
180        }
181    }
182    // Apply auto-computed ignore patterns (output file, cache dir, etc.)
183    for pattern in auto_ignores {
184        let ignore_pattern = format!("!{}", pattern);
185        if let Err(e) = override_builder.add(&ignore_pattern) {
186            log::warn!("Skipping invalid auto-ignore pattern '{}': {}", pattern, e);
187        }
188    }
189    // Also, always ignore the config file itself
190    if let Err(e) = override_builder.add("!context-builder.toml") {
191        return Err(io::Error::new(
192            io::ErrorKind::InvalidInput,
193            format!("Failed to add config ignore: {}", e),
194        ));
195    }
196
197    // Hardcoded auto-ignores for common heavy directories that should NEVER be
198    // included, even when there's no .git directory (so .gitignore isn't read).
199    // Without these, projects missing .git can produce million-line outputs
200    // from dependency trees.
201    let default_ignores = [
202        "node_modules",
203        "__pycache__",
204        ".venv",
205        "venv",
206        ".tox",
207        ".mypy_cache",
208        ".pytest_cache",
209        ".ruff_cache",
210        "vendor",  // Go, PHP, Ruby
211        ".bundle", // Ruby
212        "bower_components",
213        ".next",       // Next.js build output
214        ".nuxt",       // Nuxt build output
215        ".svelte-kit", // SvelteKit build output
216        ".angular",    // Angular cache
217        "dist",        // Common build output
218        "build",       // Common build output
219        ".gradle",     // Gradle cache
220        ".cargo",      // Cargo registry cache
221    ];
222    for dir in &default_ignores {
223        let pattern = format!("!{}/**", dir);
224        if let Err(e) = override_builder.add(&pattern) {
225            log::warn!("Skipping invalid default-ignore '{}': {}", dir, e);
226        }
227    }
228
229    let overrides = override_builder.build().map_err(|e| {
230        io::Error::new(
231            io::ErrorKind::InvalidInput,
232            format!("Failed to build overrides: {}", e),
233        )
234    })?;
235    walker.overrides(overrides);
236
237    if !filters.is_empty() {
238        let mut type_builder = ignore::types::TypesBuilder::new();
239        type_builder.add_defaults();
240        for filter in filters {
241            let _ = type_builder.add(filter, &format!("*.{}", filter));
242            type_builder.select(filter);
243        }
244        let types = type_builder.build().unwrap();
245        walker.types(types);
246    }
247
248    let mut files: Vec<DirEntry> = walker
249        .build()
250        .filter_map(Result::ok)
251        .filter(|e| e.file_type().is_some_and(|ft| ft.is_file()))
252        .collect();
253
254    // Sort files by relevance category, then entry-point priority, then alphabetically.
255    // This puts config + docs first, then source code (entry points before helpers),
256    // then tests, then docs/other, then build/CI, then lockfiles.
257    // LLMs comprehend codebases better when core source appears before test scaffolding.
258    files.sort_by(|a, b| {
259        let cat_a = file_relevance_category(a.path(), base_path);
260        let cat_b = file_relevance_category(b.path(), base_path);
261        cat_a
262            .cmp(&cat_b)
263            .then_with(|| {
264                file_entry_point_priority(a.path()).cmp(&file_entry_point_priority(b.path()))
265            })
266            .then_with(|| a.path().cmp(b.path()))
267    });
268
269    Ok(files)
270}
271
/// Prompts for confirmation before processing a large number of files.
///
/// Counts of 100 or fewer are accepted immediately without prompting.
/// Above that, a warning is printed to stdout and one line is read from
/// stdin; only an answer of `y`/`Y` (surrounding whitespace ignored)
/// yields `Ok(true)`.
///
/// # Errors
///
/// Propagates failures from flushing stdout or reading stdin.
pub fn confirm_processing(file_count: usize) -> io::Result<bool> {
    if file_count <= 100 {
        return Ok(true);
    }

    print!(
        "Warning: You're about to process {} files. This might take a while. Continue? [y/N] ",
        file_count
    );
    io::stdout().flush()?;

    let mut answer = String::new();
    io::stdin().read_line(&mut answer)?;
    Ok(answer.trim().eq_ignore_ascii_case("y"))
}
288
/// Prompts on stdout for permission to overwrite `file_path`.
///
/// Reads one line from stdin and returns `Ok(true)` only when the trimmed
/// answer is `y` or `Y`; any other answer returns `Ok(false)`.
///
/// # Errors
///
/// Propagates failures from flushing stdout or reading stdin.
pub fn confirm_overwrite(file_path: &str) -> io::Result<bool> {
    print!("The file '{}' already exists. Overwrite? [y/N] ", file_path);
    io::stdout().flush()?;

    let mut answer = String::new();
    io::stdin().read_line(&mut answer)?;

    let accepted = answer.trim().eq_ignore_ascii_case("y");
    Ok(accepted)
}
302
/// Returns the most recently modified regular file directly inside `dir`.
///
/// Subdirectories are not searched. Returns `Ok(None)` when `dir` is not a
/// directory or contains no regular files. When several files share the
/// newest modification time, the first one yielded by the directory listing
/// is kept.
///
/// # Errors
///
/// Propagates I/O errors from listing the directory or from reading a file's
/// metadata or modification time.
pub fn find_latest_file(dir: &Path) -> io::Result<Option<PathBuf>> {
    if !dir.is_dir() {
        return Ok(None);
    }

    // Track the best candidate together with its timestamp. Starting from
    // `None` (instead of a UNIX_EPOCH sentinel) means a file stamped at or
    // before the epoch is still returned when it is the only candidate.
    let mut latest: Option<(std::time::SystemTime, PathBuf)> = None;

    for entry in fs::read_dir(dir)? {
        let path = entry?.path();
        if !path.is_file() {
            continue;
        }

        let modified = fs::metadata(&path)?.modified()?;
        let is_newer = match &latest {
            Some((best_time, _)) => modified > *best_time,
            None => true,
        };
        if is_newer {
            latest = Some((modified, path));
        }
    }

    Ok(latest.map(|(_, path)| path))
}
326
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::Path;
    use tempfile::tempdir;

    /// Renders an entry's path relative to `base` with `/` separators.
    fn rel_path(entry: &DirEntry, base: &Path) -> String {
        entry
            .path()
            .strip_prefix(base)
            .unwrap()
            .to_string_lossy()
            .replace('\\', "/")
    }

    /// Converts entries into relative paths, sorted by path so membership
    /// assertions do not depend on the order produced by `collect_files`.
    fn to_rel_paths(mut entries: Vec<DirEntry>, base: &Path) -> Vec<String> {
        entries.sort_by_key(|e| e.path().to_path_buf());
        entries.iter().map(|e| rel_path(e, base)).collect()
    }

    #[test]
    fn collect_files_respects_filters() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        // create files
        fs::create_dir_all(base.join("src")).unwrap();
        fs::create_dir_all(base.join("scripts")).unwrap();
        fs::write(base.join("src").join("main.rs"), "fn main() {}").unwrap();
        fs::write(base.join("Cargo.toml"), "[package]\nname=\"x\"").unwrap();
        fs::write(base.join("README.md"), "# readme").unwrap();
        fs::write(base.join("scripts").join("build.sh"), "#!/bin/sh\n").unwrap();

        let filters = vec!["rs".to_string(), "toml".to_string()];
        let ignores: Vec<String> = vec![];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        let relative_paths = to_rel_paths(files, base);

        assert!(relative_paths.contains(&"src/main.rs".to_string()));
        assert!(relative_paths.contains(&"Cargo.toml".to_string()));
        assert!(!relative_paths.contains(&"README.md".to_string()));
        assert!(!relative_paths.contains(&"scripts/build.sh".to_string()));
    }

    #[test]
    fn collect_files_respects_ignores_for_dirs_and_files() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        fs::create_dir_all(base.join("src")).unwrap();
        fs::create_dir_all(base.join("target")).unwrap();
        fs::create_dir_all(base.join("node_modules")).unwrap();

        fs::write(base.join("src").join("main.rs"), "fn main() {}").unwrap();
        fs::write(base.join("target").join("artifact.txt"), "bin").unwrap();
        fs::write(base.join("node_modules").join("pkg.js"), "console.log();").unwrap();
        fs::write(base.join("README.md"), "# readme").unwrap();

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec!["target".into(), "node_modules".into(), "README.md".into()];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        let relative_paths = to_rel_paths(files, base);

        assert!(relative_paths.contains(&"src/main.rs".to_string()));
        assert!(!relative_paths.contains(&"target/artifact.txt".to_string()));
        assert!(!relative_paths.contains(&"node_modules/pkg.js".to_string()));
        assert!(!relative_paths.contains(&"README.md".to_string()));
    }

    #[test]
    fn collect_files_handles_invalid_ignore_pattern() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        fs::create_dir_all(base.join("src")).unwrap();
        fs::write(base.join("src").join("main.rs"), "fn main() {}").unwrap();

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec!["[".into()]; // Unclosed bracket: invalid glob

        let result = collect_files(base, &filters, &ignores, &[]);
        assert!(result.is_err());
        assert!(
            result
                .unwrap_err()
                .to_string()
                .contains("Invalid ignore pattern")
        );
    }

    #[test]
    fn collect_files_empty_directory() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec![];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        assert!(files.is_empty());
    }

    #[test]
    fn collect_files_no_matching_filters() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        fs::write(base.join("README.md"), "# readme").unwrap();
        fs::write(base.join("script.py"), "print('hello')").unwrap();

        let filters = vec!["rs".to_string()]; // Only Rust files
        let ignores: Vec<String> = vec![];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        assert!(files.is_empty());
    }

    #[test]
    fn collect_files_ignores_config_file() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        fs::write(base.join("context-builder.toml"), "[config]").unwrap();
        fs::write(base.join("other.toml"), "[other]").unwrap();

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec![];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        let relative_paths = to_rel_paths(files, base);

        assert!(!relative_paths.contains(&"context-builder.toml".to_string()));
        assert!(relative_paths.contains(&"other.toml".to_string()));
    }

    #[test]
    fn collect_files_sorts_by_relevance() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        fs::create_dir_all(base.join("src")).unwrap();
        fs::create_dir_all(base.join("tests")).unwrap();
        fs::write(base.join("Cargo.lock"), "# lock").unwrap();
        fs::write(base.join("Makefile"), "all:\n").unwrap();
        fs::write(base.join("tests").join("it.rs"), "").unwrap();
        fs::write(base.join("src").join("util.rs"), "").unwrap();
        fs::write(base.join("src").join("lib.rs"), "").unwrap();
        fs::write(base.join("Cargo.toml"), "[package]").unwrap();

        let files = collect_files(base, &[], &[], &[]).unwrap();
        let ordered: Vec<String> = files.iter().map(|e| rel_path(e, base)).collect();

        // config, source (entry point first), tests, build files, lockfiles
        assert_eq!(
            ordered,
            vec![
                "Cargo.toml",
                "src/lib.rs",
                "src/util.rs",
                "tests/it.rs",
                "Makefile",
                "Cargo.lock",
            ]
        );
    }

    #[test]
    fn relevance_category_for_common_paths() {
        let base = Path::new("/proj");
        let cat = |rel: &str| file_relevance_category(&base.join(rel), base);

        assert_eq!(cat("Cargo.toml"), 0);
        assert_eq!(cat("README.md"), 0);
        assert_eq!(cat("src/main.rs"), 1);
        assert_eq!(cat("contest.rs"), 1);
        assert_eq!(cat("src/tests/auth.rs"), 2);
        assert_eq!(cat("tests/it.rs"), 2);
        assert_eq!(cat("docs/guide.md"), 3);
        assert_eq!(cat(".github/workflows/ci.yml"), 4);
        assert_eq!(cat("Makefile"), 4);
        assert_eq!(cat("Cargo.lock"), 5);
    }

    #[test]
    fn entry_point_priority_prefers_entry_points() {
        assert_eq!(file_entry_point_priority(Path::new("src/main.rs")), 0);
        assert_eq!(file_entry_point_priority(Path::new("pkg/__init__.py")), 0);
        assert_eq!(file_entry_point_priority(Path::new("src/util.rs")), 1);
    }

    #[test]
    fn confirm_processing_small_count() {
        // Test that small file counts don't require confirmation
        let result = confirm_processing(50);
        assert!(result.is_ok());
        assert!(result.unwrap());
    }

    #[test]
    fn find_latest_file_empty_directory() {
        let dir = tempdir().unwrap();
        let result = find_latest_file(dir.path()).unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn find_latest_file_nonexistent_directory() {
        let dir = tempdir().unwrap();
        let nonexistent = dir.path().join("nonexistent");
        let result = find_latest_file(&nonexistent).unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn find_latest_file_single_file() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("test.txt");
        fs::write(&file_path, "content").unwrap();

        let result = find_latest_file(dir.path()).unwrap();
        assert!(result.is_some());
        assert_eq!(result.unwrap(), file_path);
    }

    #[test]
    fn find_latest_file_multiple_files() {
        use std::time::{Duration, SystemTime};

        let dir = tempdir().unwrap();

        let file1 = dir.path().join("old.txt");
        let file2 = dir.path().join("new.txt");

        fs::write(&file1, "old content").unwrap();
        fs::write(&file2, "new content").unwrap();

        // Stamp an explicitly older mtime on the first file instead of sleeping
        // between writes, which is unreliable on filesystems with coarse
        // timestamp granularity.
        let an_hour_ago = SystemTime::now() - Duration::from_secs(3600);
        fs::File::options()
            .write(true)
            .open(&file1)
            .unwrap()
            .set_modified(an_hour_ago)
            .unwrap();

        let result = find_latest_file(dir.path()).unwrap();
        assert!(result.is_some());
        assert_eq!(result.unwrap(), file2);
    }

    #[test]
    fn find_latest_file_ignores_directories() {
        let dir = tempdir().unwrap();
        let subdir = dir.path().join("subdir");
        fs::create_dir(&subdir).unwrap();

        let file_path = dir.path().join("test.txt");
        fs::write(&file_path, "content").unwrap();

        let result = find_latest_file(dir.path()).unwrap();
        assert!(result.is_some());
        assert_eq!(result.unwrap(), file_path);
    }

    #[test]
    fn test_confirm_processing_requires_user_interaction() {
        // The prompting path (more than 100 files) reads from the real stdin
        // and is not exercised here. This test pins the non-interactive path,
        // including the boundary value that must still skip the prompt.
        for count in [0, 50, 100] {
            let result = confirm_processing(count);
            assert!(result.is_ok());
            assert!(result.unwrap());
        }
    }

    #[test]
    fn test_confirm_overwrite_function_exists() {
        // confirm_overwrite always prompts on stdin, so only its signature is
        // checked here: pub fn confirm_overwrite(file_path: &str) -> io::Result<bool>
        let _: fn(&str) -> std::io::Result<bool> = confirm_overwrite;
    }

    #[test]
    fn test_collect_files_handles_permission_errors() {
        // Permission failures are hard to provoke portably, so this exercises
        // the error path with a malformed ignore pattern instead.
        let dir = tempdir().unwrap();
        let base = dir.path();

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec!["[invalid".into()]; // Incomplete bracket

        let result = collect_files(base, &filters, &ignores, &[]);
        assert!(result.is_err());
    }

    #[test]
    fn test_find_latest_file_permission_error() {
        // Unreadable directories are hard to set up portably; this covers the
        // related case of a path that does not exist at all.
        let nonexistent = Path::new("/this/path/should/not/exist/anywhere");
        let result = find_latest_file(nonexistent);

        // Should return Ok(None) for non-existent directories
        assert!(result.is_ok());
        assert!(result.unwrap().is_none());
    }

    #[test]
    fn test_collect_files_with_symlinks() {
        // Test behavior with symbolic links (if supported on platform)
        let dir = tempdir().unwrap();
        let base = dir.path();

        // Create a regular file
        fs::write(base.join("regular.txt"), "content").unwrap();

        // On Unix-like systems, try creating a symlink; failure to create it
        // is tolerated because the assertion below only needs the regular file.
        #[cfg(unix)]
        {
            use std::os::unix::fs::symlink;
            let _ = symlink("regular.txt", base.join("link.txt"));
        }

        // On Windows, symlinks require special privileges, so skip this part
        #[cfg(windows)]
        {
            // Just create another regular file to test
            fs::write(base.join("another.txt"), "content2").unwrap();
        }

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec![];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        // Should find at least the regular file
        assert!(!files.is_empty());
    }
}