1use std::path::{Path, PathBuf};
5
6use crate::index::languages;
7use crate::index::security;
8
/// Largest file, in bytes, that discovery will consider (10 MiB).
///
/// Used both as the walker's size cap and as the upper bound when a single
/// path is checked for indexability.
const MAX_FILE_SIZE: u64 = 10 << 20;
11
/// How an indexable file should be processed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FileClassification {
    /// A language was detected for the file's path.
    Ast,
    /// The file is safe text, but no language was detected for its path.
    ContentOnly,
}
18
19pub fn discover_files(root: &Path, exclude_patterns: &[String]) -> (Vec<PathBuf>, Vec<PathBuf>) {
22 let mut candidates = Vec::new();
23 let mut content_only = Vec::new();
24
25 let mut settings = gobby_core::indexing::WalkerSettings::new(root);
26 settings.max_filesize = Some(MAX_FILE_SIZE);
27 let mut builder = settings.into_walker();
28 builder.hidden(true);
29 let walker = builder.build();
30
31 for entry in walker.flatten() {
32 let path = entry.path();
33 if !path.is_file() {
34 continue;
35 }
36
37 match classify_file(root, path, exclude_patterns) {
38 Some(FileClassification::Ast) => candidates.push(path.to_path_buf()),
39 Some(FileClassification::ContentOnly) => content_only.push(path.to_path_buf()),
40 None => {}
41 }
42 }
43
44 (candidates, content_only)
45}
46
47pub fn classify_file(
49 root: &Path,
50 path: &Path,
51 exclude_patterns: &[String],
52) -> Option<FileClassification> {
53 if !is_safe_text_file(root, path, exclude_patterns) {
54 return None;
55 }
56
57 if languages::detect_language(&path.to_string_lossy()).is_some() {
58 Some(FileClassification::Ast)
59 } else {
60 Some(FileClassification::ContentOnly)
61 }
62}
63
64pub fn is_content_indexable(root: &Path, path: &Path, exclude_patterns: &[String]) -> bool {
66 matches!(
67 classify_file(root, path, exclude_patterns),
68 Some(FileClassification::ContentOnly)
69 )
70}
71
/// Derives a language label for a content-only file from its extension.
///
/// The label is the lower-cased extension (lossily converted to UTF-8). Paths
/// with no extension, or with an empty one such as `foo.`, are labelled
/// `"text"`.
pub fn content_language(path: &Path) -> String {
    let label = match path.extension() {
        Some(raw) => raw.to_string_lossy().to_lowercase(),
        None => String::new(),
    };

    if label.is_empty() {
        "text".to_string()
    } else {
        label
    }
}
79
80fn is_safe_text_file(root: &Path, path: &Path, exclude_patterns: &[String]) -> bool {
81 if !path.is_file() {
82 return false;
83 }
84 if !security::validate_path(path, root) {
85 return false;
86 }
87 if !security::is_symlink_safe(path, root) {
88 return false;
89 }
90 if security::should_exclude_path(root, path, exclude_patterns) {
91 return false;
92 }
93 if security::has_secret_extension(path) {
94 return false;
95 }
96
97 let Ok(meta) = path.metadata() else {
98 return false;
99 };
100 if meta.len() == 0 || meta.len() > MAX_FILE_SIZE {
101 return false;
102 }
103
104 !security::is_binary(path)
105}
106
#[cfg(test)]
mod tests {
    use super::*;

    /// Writes `contents` to `root/rel`, creating parent directories first.
    fn write_file(root: &Path, rel: &str, contents: &[u8]) {
        let target = root.join(rel);
        if let Some(dir) = target.parent() {
            std::fs::create_dir_all(dir).expect("create parent");
        }
        std::fs::write(&target, contents).expect("write file");
    }

    /// Converts absolute paths into sorted, `root`-relative strings.
    fn rels(root: &Path, paths: Vec<PathBuf>) -> Vec<String> {
        let mut relative = Vec::with_capacity(paths.len());
        for path in &paths {
            let stripped = path.strip_prefix(root).expect("path under root");
            relative.push(stripped.to_string_lossy().into_owned());
        }
        relative.sort();
        relative
    }

    #[test]
    fn discovers_ast_and_content_only_text_files() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();

        // Fixture tree: a mix of files expected to land in the AST list, in
        // the content-only list, or in neither.
        let fixtures: &[(&str, &[u8])] = &[
            ("README.md", b"# Title\n"),
            ("skills/gcode/SKILL.md", b"# gcode\n"),
            ("src/lib.rs", b"fn main() {}\n"),
            ("docs/guide.rst", b"Guide\n=====\n"),
            ("notes.txt", b"plain notes\n"),
            ("config/app.properties", b"mode=dev\n"),
            ("config/app.toml", b"mode = 'dev'\n"),
            ("scripts/setup.sh", b"#!/usr/bin/env bash\n"),
            ("Dockerfile", b"FROM rust:latest\n"),
            ("image.bin", b"PNG\0binary"),
            ("api_key.txt", b"secret-ish\n"),
            ("target/generated.txt", b"generated\n"),
        ];
        for &(rel, contents) in fixtures {
            write_file(root, rel, contents);
        }

        let excludes = vec!["target".to_string()];
        let (ast, content_only) = discover_files(root, &excludes);

        let ast_rels = rels(root, ast);
        assert_eq!(
            ast_rels,
            vec!["README.md", "skills/gcode/SKILL.md", "src/lib.rs"]
        );

        let content_rels = rels(root, content_only);
        assert_eq!(
            content_rels,
            vec![
                "Dockerfile",
                "config/app.properties",
                "config/app.toml",
                "docs/guide.rst",
                "notes.txt",
                "scripts/setup.sh"
            ]
        );
    }

    #[test]
    fn classifies_extensionless_text_as_content_only() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        write_file(root, "Makefile", b"test:\n\tcargo test\n");
        let excludes = Vec::new();
        let makefile = root.join("Makefile");

        assert_eq!(
            classify_file(root, &makefile, &excludes),
            Some(FileClassification::ContentOnly)
        );
        assert_eq!(content_language(&makefile), "text");
    }

    #[test]
    fn classifies_source_build_directory_as_ast_indexable() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        let rel = "src/gobby/build/workspaces.py";
        write_file(root, rel, b"class WorkspaceBuilder:\n pass\n");
        let excludes = vec!["build".to_string(), "dist".to_string()];

        // A nested `build` directory must not trip the `build` exclude.
        let verdict = classify_file(root, &root.join(rel), &excludes);
        assert_eq!(verdict, Some(FileClassification::Ast));
    }

    #[test]
    fn skips_root_build_directory() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        let rel = "build/generated.py";
        write_file(root, rel, b"class Generated:\n pass\n");
        let excludes = vec!["build".to_string(), "dist".to_string()];

        let verdict = classify_file(root, &root.join(rel), &excludes);
        assert_eq!(verdict, None);
    }

    #[test]
    fn walker_consumes_gobby_core_walker_settings() {
        let source = include_str!("walker.rs");
        // Both needles are assembled from fragments so that this test's own
        // source text never contains the forbidden needle verbatim.
        let required = String::from("gobby_core") + "::indexing::WalkerSettings";
        let forbidden = String::from("WalkBuilder") + "::new(root)";

        assert!(source.contains(required.as_str()));
        assert!(!source.contains(forbidden.as_str()));
    }
}