Skip to main content

context_builder/
file_utils.rs

1use ignore::{DirEntry, WalkBuilder, overrides::OverrideBuilder};
2use std::fs;
3use std::io::{self, Write};
4use std::path::{Path, PathBuf};
5
/// Returns a numeric category for file relevance ordering.
///
/// Lower numbers appear first in output. Categories:
/// 0 = Project config + key docs (Cargo.toml, README.md, AGENTS.md, etc.)
/// 1 = Source code (src/, lib/) — entry points sorted first within category
/// 2 = Tests and benchmarks (tests/, benches/, test/, spec/)
/// 3 = Documentation, scripts, and everything else
/// 4 = Build/CI infrastructure (.github/, .circleci/, Dockerfile, etc.)
/// 5 = Generated/lock files (Cargo.lock, package-lock.json, etc.)
///
/// `path` is made relative to `base_path` before classification; if it does not
/// live under `base_path` it is classified as given.
fn file_relevance_category(path: &Path, base_path: &Path) -> u8 {
    const LOCKFILE_NAMES: &[&str] = &[
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Gemfile.lock",
        "poetry.lock",
        "composer.lock",
        "go.sum",
        "bun.lockb",
        "flake.lock",
    ];
    const CONFIG_NAMES: &[&str] = &[
        // Package manifests
        "Cargo.toml",
        "package.json",
        "tsconfig.json",
        "pyproject.toml",
        "setup.py",
        "setup.cfg",
        "go.mod",
        "Gemfile",
        // Tool config
        "context-builder.toml",
        ".gitignore",
        // Key project documentation (LLMs need these for context)
        "README.md",
        "README",
        "README.txt",
        "README.rst",
        "AGENTS.md",
        "CLAUDE.md",
        "GEMINI.md",
        "COPILOT.md",
        "CONTRIBUTING.md",
        "CHANGELOG.md",
    ];
    const BUILD_FILE_NAMES: &[&str] = &[
        "Makefile",
        "CMakeLists.txt",
        "Dockerfile",
        "Containerfile",
        "Justfile",
        "Taskfile",
        "Rakefile",
        "Vagrantfile",
    ];
    // Directory names that mark tests/benchmarks nested inside a source tree.
    const SOURCE_TREE_TEST_DIRS: &[&str] = &[
        "tests",
        "test",
        "spec",
        "__tests__",
        "benches",
        "benchmarks",
    ];
    // Directory names that mark tests for source files outside a recognized tree.
    const LOOSE_TEST_DIRS: &[&str] = &["test", "tests", "spec", "__tests__"];
    const TEST_FILE_SUFFIXES: &[&str] = &[
        "_test.rs",
        "_test.go",
        "_spec.rb",
        ".test.ts",
        ".test.js",
        ".spec.ts",
    ];

    let relative = path.strip_prefix(base_path).unwrap_or(path);
    let file_name = relative.file_name().and_then(|n| n.to_str());

    if let Some(name) = file_name {
        // Lockfiles are checked first — these are lowest priority.
        if LOCKFILE_NAMES.contains(&name) {
            return 5;
        }
        // Config/manifest files + key project docs — highest priority.
        if CONFIG_NAMES.contains(&name) {
            return 0;
        }
    }

    // True when any directory on the way to the file carries one of `names`.
    // Comparing whole path components (instead of searching the path text for
    // "/tests/") keeps the check independent of the platform's separator and
    // still avoids false positives such as "contest.rs".
    let in_dir_named = |names: &[&str]| -> bool {
        relative.parent().is_some_and(|parent| {
            parent
                .components()
                .any(|c| c.as_os_str().to_str().is_some_and(|s| names.contains(&s)))
        })
    };

    // Check path prefix for category
    let first_component = relative
        .components()
        .next()
        .and_then(|c| c.as_os_str().to_str())
        .unwrap_or("");

    match first_component {
        "src" | "lib" | "crates" | "packages" | "internal" | "cmd" | "pkg" => {
            // Test directories within source trees count as tests,
            // e.g. src/tests/auth.rs is cat 2 (tests), not cat 1 (source).
            if in_dir_named(SOURCE_TREE_TEST_DIRS) {
                2
            } else {
                1
            }
        }
        "tests" | "test" | "spec" | "benches" | "benchmarks" | "__tests__" => 2,
        "docs" | "doc" | "examples" | "scripts" | "tools" | "assets" => 3,
        // Build/CI infrastructure — useful context but not core source
        ".github" | ".circleci" | ".gitlab" | ".buildkite" => 4,
        _ => {
            // Build-related files are matched by name before looking at the
            // extension; otherwise "CMakeLists.txt" would be swallowed by the
            // "txt" documentation rule below.
            if file_name.is_some_and(|name| BUILD_FILE_NAMES.contains(&name)) {
                return 4;
            }

            match relative.extension().and_then(|e| e.to_str()) {
                Some(
                    "rs" | "go" | "py" | "ts" | "js" | "java" | "c" | "cpp" | "h" | "hpp" | "rb"
                    | "swift" | "kt" | "scala" | "ex" | "exs" | "zig" | "hs",
                ) => {
                    // Source file not in a recognized dir — check if it's a test.
                    let name = relative
                        .file_name()
                        .map(|n| n.to_string_lossy())
                        .unwrap_or_default();
                    let rel_str = relative.to_string_lossy();
                    let is_test = in_dir_named(LOOSE_TEST_DIRS)
                        || TEST_FILE_SUFFIXES.iter().any(|suffix| name.ends_with(*suffix))
                        // A leading "test_" on the relative path (top-level entry) or
                        // on the file itself (e.g. app/test_utils.py) marks a test.
                        || rel_str.starts_with("test_")
                        || name.starts_with("test_");
                    if is_test {
                        2
                    } else {
                        1
                    }
                }
                Some("md" | "txt" | "rst" | "adoc") => 3,
                Some(_) => 1, // Unknown extension outside known dirs — treat as source
                None => 3,    // No extension — docs/other
            }
        }
    }
}
138
/// Ranks a file within its relevance category; lower sorts first.
///
/// Files whose stem is a conventional entry-point name (`main`, `lib`, `mod`,
/// `index`, `app`, `__init__`) get 0 and every other file gets 1, so that
/// architectural entry points are listed ahead of helper modules.
fn file_entry_point_priority(path: &Path) -> u8 {
    let stem = path.file_stem().and_then(|s| s.to_str());
    match stem {
        Some("main" | "lib" | "mod" | "index" | "app" | "__init__") => 0,
        _ => 1,
    }
}
152
153/// Collects all files to be processed using `ignore` crate for efficient traversal.
154///
155/// `auto_ignores` are runtime-computed exclusion patterns (e.g., the tool's own
156/// output file or cache directory). They are processed identically to user ignores
157/// but kept separate to avoid polluting user-facing configuration.
158pub fn collect_files(
159    base_path: &Path,
160    filters: &[String],
161    ignores: &[String],
162    auto_ignores: &[String],
163) -> io::Result<Vec<DirEntry>> {
164    let mut walker = WalkBuilder::new(base_path);
165    // By default, the "ignore" crate respects .gitignore and hidden files, so we don't need walker.hidden(false)
166
167    // Build overrides for custom ignore patterns
168    let mut override_builder = OverrideBuilder::new(base_path);
169
170    // Hardcoded auto-ignores for common heavy directories that should NEVER be
171    // included, even when there's no .git directory (so .gitignore isn't read).
172    // Without these, projects missing .git can produce million-line outputs
173    // from dependency trees.
174    //
175    // IMPORTANT: These are added FIRST so that user ignores can override them.
176    // The ignore crate uses "last-match-wins" semantics, so a user can whitelist
177    // a legitimate "vendor" or "build" dir by passing it as a filter pattern.
178    //
179    // IMPORTANT: Patterns must NOT contain a slash — the ignore crate anchors
180    // slash-containing patterns to the root, so `!dir/**` would only match
181    // top-level dirs, missing nested ones like `apps/web/node_modules/`.
182    let default_ignores = [
183        "node_modules",
184        "__pycache__",
185        ".venv",
186        "venv",
187        ".tox",
188        ".mypy_cache",
189        ".pytest_cache",
190        ".ruff_cache",
191        "vendor",  // Go, PHP, Ruby
192        ".bundle", // Ruby
193        "bower_components",
194        ".next",       // Next.js build output
195        ".nuxt",       // Nuxt build output
196        ".svelte-kit", // SvelteKit build output
197        ".angular",    // Angular cache
198        "dist",        // Common build output
199        "build",       // Common build output
200        ".gradle",     // Gradle cache
201        ".cargo",      // Cargo registry cache
202    ];
203    for dir in &default_ignores {
204        // No slash in pattern → matches at any depth (not root-anchored)
205        let pattern = format!("!{}", dir);
206        if let Err(e) = override_builder.add(&pattern) {
207            log::warn!("Skipping invalid default-ignore '{}': {}", dir, e);
208        }
209    }
210
211    // User-specified ignore patterns (added AFTER defaults so they can override)
212    for pattern in ignores {
213        // Attention: Confusing pattern ahead!
214        // Add the pattern to the override builder with ! prefix to ignore matching files.
215        // In OverrideBuilder, patterns without ! are whitelist (include) patterns,
216        // while patterns with ! are ignore patterns.
217        let ignore_pattern = format!("!{}", pattern);
218        if let Err(e) = override_builder.add(&ignore_pattern) {
219            return Err(io::Error::new(
220                io::ErrorKind::InvalidInput,
221                format!("Invalid ignore pattern '{}': {}", pattern, e),
222            ));
223        }
224    }
225    // Apply auto-computed ignore patterns (output file, cache dir, etc.)
226    for pattern in auto_ignores {
227        let ignore_pattern = format!("!{}", pattern);
228        if let Err(e) = override_builder.add(&ignore_pattern) {
229            log::warn!("Skipping invalid auto-ignore pattern '{}': {}", pattern, e);
230        }
231    }
232    // Also, always ignore the config file itself
233    if let Err(e) = override_builder.add("!context-builder.toml") {
234        return Err(io::Error::new(
235            io::ErrorKind::InvalidInput,
236            format!("Failed to add config ignore: {}", e),
237        ));
238    }
239
240    let overrides = override_builder.build().map_err(|e| {
241        io::Error::new(
242            io::ErrorKind::InvalidInput,
243            format!("Failed to build overrides: {}", e),
244        )
245    })?;
246    walker.overrides(overrides);
247
248    if !filters.is_empty() {
249        let mut type_builder = ignore::types::TypesBuilder::new();
250        type_builder.add_defaults();
251        for filter in filters {
252            let _ = type_builder.add(filter, &format!("*.{}", filter));
253            type_builder.select(filter);
254        }
255        let types = type_builder.build().unwrap();
256        walker.types(types);
257    }
258
259    let mut files: Vec<DirEntry> = walker
260        .build()
261        .filter_map(Result::ok)
262        .filter(|e| e.file_type().is_some_and(|ft| ft.is_file()))
263        .collect();
264
265    // Sort files by relevance category, then entry-point priority, then alphabetically.
266    // This puts config + docs first, then source code (entry points before helpers),
267    // then tests, then docs/other, then build/CI, then lockfiles.
268    // LLMs comprehend codebases better when core source appears before test scaffolding.
269    files.sort_by(|a, b| {
270        let cat_a = file_relevance_category(a.path(), base_path);
271        let cat_b = file_relevance_category(b.path(), base_path);
272        cat_a
273            .cmp(&cat_b)
274            .then_with(|| {
275                file_entry_point_priority(a.path()).cmp(&file_entry_point_priority(b.path()))
276            })
277            .then_with(|| a.path().cmp(b.path()))
278    });
279
280    Ok(files)
281}
282
/// Prompts for confirmation before processing a large number of files.
///
/// Counts of 100 or fewer are accepted without prompting. Above that, a
/// warning is printed to stdout and one line is read from stdin; only an
/// answer of `y` (case-insensitive, surrounding whitespace ignored) confirms.
///
/// # Errors
///
/// Propagates failures from flushing stdout or reading stdin.
pub fn confirm_processing(file_count: usize) -> io::Result<bool> {
    if file_count <= 100 {
        return Ok(true);
    }

    print!(
        "Warning: You're about to process {} files. This might take a while. Continue? [y/N] ",
        file_count
    );
    // The prompt has no trailing newline, so flush to make it visible before blocking.
    io::stdout().flush()?;

    let mut answer = String::new();
    io::stdin().read_line(&mut answer)?;
    Ok(answer.trim().eq_ignore_ascii_case("y"))
}
299
/// Prompts on stdout for permission to overwrite `file_path`.
///
/// Reads one line from stdin and returns `Ok(true)` only when the trimmed
/// answer is `y` (case-insensitive); any other answer yields `Ok(false)`.
///
/// # Errors
///
/// Propagates failures from flushing stdout or reading stdin.
pub fn confirm_overwrite(file_path: &str) -> io::Result<bool> {
    print!("The file '{}' already exists. Overwrite? [y/N] ", file_path);
    // The prompt has no trailing newline, so flush to make it visible before blocking.
    io::stdout().flush()?;

    let mut answer = String::new();
    io::stdin().read_line(&mut answer)?;

    Ok(answer.trim().eq_ignore_ascii_case("y"))
}
313
/// Returns the most recently modified regular file directly inside `dir`.
///
/// Sub-directories are not descended into. Returns `Ok(None)` when `dir` is
/// not a directory or contains no regular files. When several files share the
/// newest modification time, the one yielded first by the directory listing wins.
///
/// # Errors
///
/// Propagates failures from listing the directory or from reading a file's
/// modification time. Entries whose metadata cannot be read are skipped.
pub fn find_latest_file(dir: &Path) -> io::Result<Option<PathBuf>> {
    if !dir.is_dir() {
        return Ok(None);
    }

    // Track the best candidate together with its timestamp. Starting from
    // `None` (rather than a UNIX_EPOCH sentinel) means a file whose mtime is
    // exactly the epoch is still a valid result.
    let mut latest: Option<(std::time::SystemTime, PathBuf)> = None;

    for entry in fs::read_dir(dir)? {
        let path = entry?.path();

        // A single metadata lookup serves both the file-type check and the timestamp.
        let Ok(metadata) = fs::metadata(&path) else {
            continue;
        };
        if !metadata.is_file() {
            continue;
        }

        let modified = metadata.modified()?;
        let is_newer = latest.as_ref().map_or(true, |(best, _)| modified > *best);
        if is_newer {
            latest = Some((modified, path));
        }
    }

    Ok(latest.map(|(_, path)| path))
}
337
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::Path;
    use tempfile::tempdir;

    /// Maps entries to `/`-separated paths relative to `base`, keeping the given order.
    fn rel_paths_in_order(entries: &[DirEntry], base: &Path) -> Vec<String> {
        entries
            .iter()
            .map(|e| {
                e.path()
                    .strip_prefix(base)
                    .unwrap()
                    .to_string_lossy()
                    .replace('\\', "/")
            })
            .collect()
    }

    /// Like `rel_paths_in_order`, but sorted by path so membership checks do
    /// not depend on the relevance ordering.
    fn to_rel_paths(mut entries: Vec<DirEntry>, base: &Path) -> Vec<String> {
        entries.sort_by_key(|e| e.path().to_path_buf());
        rel_paths_in_order(&entries, base)
    }

    #[test]
    fn collect_files_respects_filters() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        // create files
        fs::create_dir_all(base.join("src")).unwrap();
        fs::create_dir_all(base.join("scripts")).unwrap();
        fs::write(base.join("src").join("main.rs"), "fn main() {}").unwrap();
        fs::write(base.join("Cargo.toml"), "[package]\nname=\"x\"").unwrap();
        fs::write(base.join("README.md"), "# readme").unwrap();
        fs::write(base.join("scripts").join("build.sh"), "#!/bin/sh\n").unwrap();

        let filters = vec!["rs".to_string(), "toml".to_string()];
        let ignores: Vec<String> = vec![];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        let relative_paths = to_rel_paths(files, base);

        assert!(relative_paths.contains(&"src/main.rs".to_string()));
        assert!(relative_paths.contains(&"Cargo.toml".to_string()));
        assert!(!relative_paths.contains(&"README.md".to_string()));
        assert!(!relative_paths.contains(&"scripts/build.sh".to_string()));
    }

    #[test]
    fn collect_files_respects_ignores_for_dirs_and_files() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        fs::create_dir_all(base.join("src")).unwrap();
        fs::create_dir_all(base.join("target")).unwrap();
        fs::create_dir_all(base.join("node_modules")).unwrap();

        fs::write(base.join("src").join("main.rs"), "fn main() {}").unwrap();
        fs::write(base.join("target").join("artifact.txt"), "bin").unwrap();
        fs::write(base.join("node_modules").join("pkg.js"), "console.log();").unwrap();
        fs::write(base.join("README.md"), "# readme").unwrap();

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec!["target".into(), "node_modules".into(), "README.md".into()];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        let relative_paths = to_rel_paths(files, base);

        assert!(relative_paths.contains(&"src/main.rs".to_string()));
        assert!(!relative_paths.contains(&"target/artifact.txt".to_string()));
        assert!(!relative_paths.contains(&"node_modules/pkg.js".to_string()));
        assert!(!relative_paths.contains(&"README.md".to_string()));
    }

    #[test]
    fn collect_files_handles_invalid_ignore_pattern() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        fs::create_dir_all(base.join("src")).unwrap();
        fs::write(base.join("src").join("main.rs"), "fn main() {}").unwrap();

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec!["[".into()]; // Invalid glob: unclosed character class

        let result = collect_files(base, &filters, &ignores, &[]);
        assert!(result.is_err());
        assert!(
            result
                .unwrap_err()
                .to_string()
                .contains("Invalid ignore pattern")
        );
    }

    #[test]
    fn collect_files_empty_directory() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec![];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        assert!(files.is_empty());
    }

    #[test]
    fn collect_files_no_matching_filters() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        fs::write(base.join("README.md"), "# readme").unwrap();
        fs::write(base.join("script.py"), "print('hello')").unwrap();

        let filters = vec!["rs".to_string()]; // Only Rust files
        let ignores: Vec<String> = vec![];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        assert!(files.is_empty());
    }

    #[test]
    fn collect_files_ignores_config_file() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        fs::write(base.join("context-builder.toml"), "[config]").unwrap();
        fs::write(base.join("other.toml"), "[other]").unwrap();

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec![];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        let relative_paths = to_rel_paths(files, base);

        assert!(!relative_paths.contains(&"context-builder.toml".to_string()));
        assert!(relative_paths.contains(&"other.toml".to_string()));
    }

    #[test]
    fn collect_files_orders_by_relevance() {
        let dir = tempdir().unwrap();
        let base = dir.path();

        fs::create_dir_all(base.join("src")).unwrap();
        fs::create_dir_all(base.join("tests")).unwrap();
        fs::create_dir_all(base.join("docs")).unwrap();
        fs::write(base.join("Cargo.lock"), "").unwrap();
        fs::write(base.join("docs").join("guide.md"), "# guide").unwrap();
        fs::write(base.join("tests").join("it.rs"), "").unwrap();
        fs::write(base.join("src").join("util.rs"), "").unwrap();
        fs::write(base.join("src").join("main.rs"), "fn main() {}").unwrap();
        fs::write(base.join("Cargo.toml"), "[package]\nname=\"x\"").unwrap();

        let files = collect_files(base, &[], &[], &[]).unwrap();

        // Config first, entry point before helper, then tests, docs, lockfile last.
        assert_eq!(
            rel_paths_in_order(&files, base),
            vec![
                "Cargo.toml",
                "src/main.rs",
                "src/util.rs",
                "tests/it.rs",
                "docs/guide.md",
                "Cargo.lock",
            ]
        );
    }

    #[test]
    fn file_relevance_category_ranks_known_locations() {
        let base = Path::new("/project");
        let category = |rel: &str| file_relevance_category(&base.join(rel), base);

        assert_eq!(category("Cargo.toml"), 0);
        assert_eq!(category("README.md"), 0);
        assert_eq!(category("src/lib.rs"), 1);
        assert_eq!(category("contest.rs"), 1);
        assert_eq!(category("src/tests/auth.rs"), 2);
        assert_eq!(category("tests/it.rs"), 2);
        assert_eq!(category("docs/guide.md"), 3);
        assert_eq!(category(".github/workflows/ci.yml"), 4);
        assert_eq!(category("Dockerfile"), 4);
        assert_eq!(category("Cargo.lock"), 5);
    }

    #[test]
    fn file_entry_point_priority_prefers_entry_points() {
        assert_eq!(file_entry_point_priority(Path::new("src/main.rs")), 0);
        assert_eq!(file_entry_point_priority(Path::new("pkg/__init__.py")), 0);
        assert_eq!(file_entry_point_priority(Path::new("src/util.rs")), 1);
    }

    #[test]
    fn confirm_processing_small_count() {
        // Test that small file counts don't require confirmation
        let result = confirm_processing(50);
        assert!(result.is_ok());
        assert!(result.unwrap());
    }

    #[test]
    fn find_latest_file_empty_directory() {
        let dir = tempdir().unwrap();
        let result = find_latest_file(dir.path()).unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn find_latest_file_nonexistent_directory() {
        let dir = tempdir().unwrap();
        let nonexistent = dir.path().join("nonexistent");
        let result = find_latest_file(&nonexistent).unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn find_latest_file_single_file() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("test.txt");
        fs::write(&file_path, "content").unwrap();

        let result = find_latest_file(dir.path()).unwrap();
        assert!(result.is_some());
        assert_eq!(result.unwrap(), file_path);
    }

    #[test]
    fn find_latest_file_multiple_files() {
        let dir = tempdir().unwrap();

        let file1 = dir.path().join("old.txt");
        let file2 = dir.path().join("new.txt");

        fs::write(&file1, "old content").unwrap();
        // Back-date the first file explicitly instead of sleeping: a short sleep
        // is not enough on filesystems with coarse timestamp granularity.
        let long_ago = std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(1_000);
        fs::File::options()
            .write(true)
            .open(&file1)
            .unwrap()
            .set_modified(long_ago)
            .unwrap();
        fs::write(&file2, "new content").unwrap();

        let result = find_latest_file(dir.path()).unwrap();
        assert!(result.is_some());
        assert_eq!(result.unwrap(), file2);
    }

    #[test]
    fn find_latest_file_ignores_directories() {
        let dir = tempdir().unwrap();
        let subdir = dir.path().join("subdir");
        fs::create_dir(&subdir).unwrap();

        let file_path = dir.path().join("test.txt");
        fs::write(&file_path, "content").unwrap();

        let result = find_latest_file(dir.path()).unwrap();
        assert!(result.is_some());
        assert_eq!(result.unwrap(), file_path);
    }

    #[test]
    fn test_confirm_processing_requires_user_interaction() {
        // Counts above 100 prompt on stdin, which cannot be driven from a unit
        // test without injecting the reader. What can be checked is the
        // boundary: exactly 100 files must still be accepted without prompting.
        let result = confirm_processing(100);
        assert!(result.is_ok());
        assert!(result.unwrap());
    }

    #[test]
    fn test_confirm_overwrite_function_exists() {
        // confirm_overwrite always prompts, so its interactive behavior is not
        // exercised here; this only pins the signature callers rely on.
        let _: fn(&str) -> std::io::Result<bool> = confirm_overwrite;
    }

    #[test]
    fn test_collect_files_handles_permission_errors() {
        // Real permission errors are hard to provoke portably, so this covers
        // the other error path instead: a malformed user ignore pattern.
        let dir = tempdir().unwrap();
        let base = dir.path();

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec!["[invalid".into()]; // Incomplete bracket

        let result = collect_files(base, &filters, &ignores, &[]);
        assert!(result.is_err());
    }

    #[test]
    fn test_find_latest_file_permission_error() {
        // A path that does not exist is reported as "nothing found", not as an error.
        let nonexistent = Path::new("/this/path/should/not/exist/anywhere");
        let result = find_latest_file(nonexistent);

        assert!(result.is_ok());
        assert!(result.unwrap().is_none());
    }

    #[test]
    fn test_collect_files_with_symlinks() {
        // Test behavior with symbolic links (if supported on platform)
        let dir = tempdir().unwrap();
        let base = dir.path();

        // Create a regular file
        fs::write(base.join("regular.txt"), "content").unwrap();

        // On Unix-like systems, try creating a symlink (best effort)
        #[cfg(unix)]
        {
            use std::os::unix::fs::symlink;
            let _ = symlink("regular.txt", base.join("link.txt"));
        }

        // On Windows, symlinks require special privileges, so skip this part
        #[cfg(windows)]
        {
            // Just create another regular file to test
            fs::write(base.join("another.txt"), "content2").unwrap();
        }

        let filters: Vec<String> = vec![];
        let ignores: Vec<String> = vec![];

        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
        let relative_paths = to_rel_paths(files, base);
        // The regular file must be collected regardless of how the link is treated.
        assert!(relative_paths.contains(&"regular.txt".to_string()));
    }
}
635        let files = collect_files(base, &filters, &ignores, &[]).unwrap();
636        // Should find at least the regular file
637        assert!(!files.is_empty());
638    }
639}