Skip to main content

gobby_code/index/
walker.rs

1//! Git-aware file discovery using the `ignore` crate.
2//! Respects .gitignore and exclude patterns.
3
4use std::path::{Path, PathBuf};
5
6use crate::index::languages;
7use crate::index::security;
8
/// Largest file, in bytes, that is eligible for indexing: 10 MiB (10 * 2^20).
const MAX_FILE_SIZE: u64 = 10 << 20;
11
/// Indexing strategy selected for a single file.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FileClassification {
    /// Chosen by `classify_file` when `languages::detect_language` recognizes the path.
    Ast,
    /// Chosen by `classify_file` for safe text files with no detected language.
    ContentOnly,
}
18
19/// Discover files eligible for indexing under `root`.
20/// Returns (ast_candidates, content_only_candidates) as absolute paths.
21pub fn discover_files(root: &Path, exclude_patterns: &[String]) -> (Vec<PathBuf>, Vec<PathBuf>) {
22    let mut candidates = Vec::new();
23    let mut content_only = Vec::new();
24
25    let mut settings = gobby_core::indexing::WalkerSettings::new(root);
26    settings.max_filesize = Some(MAX_FILE_SIZE);
27    let mut builder = settings.into_walker();
28    builder.hidden(true);
29    let walker = builder.build();
30
31    for entry in walker.flatten() {
32        let path = entry.path();
33        if !path.is_file() {
34            continue;
35        }
36
37        match classify_file(root, path, exclude_patterns) {
38            Some(FileClassification::Ast) => candidates.push(path.to_path_buf()),
39            Some(FileClassification::ContentOnly) => content_only.push(path.to_path_buf()),
40            None => {}
41        }
42    }
43
44    (candidates, content_only)
45}
46
47/// Classify an individual file for indexing.
48pub fn classify_file(
49    root: &Path,
50    path: &Path,
51    exclude_patterns: &[String],
52) -> Option<FileClassification> {
53    if !is_safe_text_file(root, path, exclude_patterns) {
54        return None;
55    }
56
57    if languages::detect_language(&path.to_string_lossy()).is_some() {
58        Some(FileClassification::Ast)
59    } else {
60        Some(FileClassification::ContentOnly)
61    }
62}
63
64/// Return true when `path` is an unsupported, safe text file suitable for chunks.
65pub fn is_content_indexable(root: &Path, path: &Path, exclude_patterns: &[String]) -> bool {
66    matches!(
67        classify_file(root, path, exclude_patterns),
68        Some(FileClassification::ContentOnly)
69    )
70}
71
/// Derive the language label used for a content-only file.
///
/// The label is the file extension, lossily converted to UTF-8 and lowercased.
/// Paths with no extension, or with an empty one, are labelled `"text"`.
pub fn content_language(path: &Path) -> String {
    let Some(raw_ext) = path.extension() else {
        return "text".to_string();
    };

    let label = raw_ext.to_string_lossy().to_lowercase();
    if label.is_empty() {
        "text".to_string()
    } else {
        label
    }
}
79
80fn is_safe_text_file(root: &Path, path: &Path, exclude_patterns: &[String]) -> bool {
81    if !path.is_file() {
82        return false;
83    }
84    if !security::validate_path(path, root) {
85        return false;
86    }
87    if !security::is_symlink_safe(path, root) {
88        return false;
89    }
90    if security::should_exclude_path(root, path, exclude_patterns) {
91        return false;
92    }
93    if security::has_secret_extension(path) {
94        return false;
95    }
96
97    let Ok(meta) = path.metadata() else {
98        return false;
99    };
100    if meta.len() == 0 || meta.len() > MAX_FILE_SIZE {
101        return false;
102    }
103
104    !security::is_binary(path)
105}
106
#[cfg(test)]
mod tests {
    use super::*;

    /// Write `contents` to `root/rel`, creating any missing parent directories.
    fn write_file(root: &Path, rel: &str, contents: &[u8]) {
        let target = root.join(rel);
        if let Some(dir) = target.parent() {
            std::fs::create_dir_all(dir).expect("create parent");
        }
        std::fs::write(target, contents).expect("write file");
    }

    /// Turn `paths` into sorted strings relative to `root`.
    fn rels(root: &Path, paths: Vec<PathBuf>) -> Vec<String> {
        let mut relative = Vec::with_capacity(paths.len());
        for path in paths {
            let rel = path.strip_prefix(root).expect("path under root");
            relative.push(rel.to_string_lossy().to_string());
        }
        relative.sort();
        relative
    }

    #[test]
    fn discovers_ast_and_content_only_text_files() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();

        let fixtures: &[(&str, &[u8])] = &[
            ("README.md", b"# Title\n"),
            ("skills/gcode/SKILL.md", b"# gcode\n"),
            ("src/lib.rs", b"fn main() {}\n"),
            ("docs/guide.rst", b"Guide\n=====\n"),
            ("notes.txt", b"plain notes\n"),
            ("config/app.properties", b"mode=dev\n"),
            ("config/app.toml", b"mode = 'dev'\n"),
            ("scripts/setup.sh", b"#!/usr/bin/env bash\n"),
            ("Dockerfile", b"FROM rust:latest\n"),
            ("image.bin", b"PNG\0binary"),
            ("api_key.txt", b"secret-ish\n"),
            ("target/generated.txt", b"generated\n"),
        ];
        for &(rel, contents) in fixtures {
            write_file(root, rel, contents);
        }

        let excludes = vec!["target".to_string()];
        let (ast, content_only) = discover_files(root, &excludes);

        // Three fixtures must be absent from both lists: api_key.txt (rejected by the
        // security module, whose SECRET_SUBSTRINGS matches "api_key"), image.bin
        // (binary detection), and target/* (listed in `excludes`).
        assert_eq!(
            rels(root, ast),
            ["README.md", "skills/gcode/SKILL.md", "src/lib.rs"]
        );
        assert_eq!(
            rels(root, content_only),
            [
                "Dockerfile",
                "config/app.properties",
                "config/app.toml",
                "docs/guide.rst",
                "notes.txt",
                "scripts/setup.sh",
            ]
        );
    }

    #[test]
    fn classifies_extensionless_text_as_content_only() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        write_file(root, "Makefile", b"test:\n\tcargo test\n");

        let makefile = root.join("Makefile");
        let no_excludes: Vec<String> = Vec::new();

        assert_eq!(
            classify_file(root, &makefile, &no_excludes),
            Some(FileClassification::ContentOnly)
        );
        assert_eq!(content_language(&makefile), "text");
    }

    #[test]
    fn classifies_source_build_directory_as_ast_indexable() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        write_file(
            root,
            "src/gobby/build/workspaces.py",
            b"class WorkspaceBuilder:\n    pass\n",
        );

        // A `build` directory nested under `src/` must stay indexable even though
        // "build" is an exclude pattern.
        let excludes = vec!["build".to_string(), "dist".to_string()];
        let nested = root.join("src/gobby/build/workspaces.py");

        assert_eq!(
            classify_file(root, &nested, &excludes),
            Some(FileClassification::Ast)
        );
    }

    #[test]
    fn skips_root_build_directory() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        write_file(root, "build/generated.py", b"class Generated:\n    pass\n");

        let excludes = vec!["build".to_string(), "dist".to_string()];
        let generated = root.join("build/generated.py");

        assert_eq!(classify_file(root, &generated, &excludes), None);
    }

    #[test]
    fn walker_consumes_gobby_core_walker_settings() {
        // The needles are assembled from pieces so that this test's own source text
        // does not contain them verbatim.
        let this_file = include_str!("walker.rs");
        let required = ["gobby_core", "::indexing::WalkerSettings"].concat();
        let forbidden = ["WalkBuilder", "::new(root)"].concat();

        assert!(this_file.contains(&required));
        assert!(!this_file.contains(&forbidden));
    }
}