codebones_core/
indexer.rs1use ignore::WalkBuilder;
2use sha2::{Digest, Sha256};
3use std::fs::File;
4use std::io::Read;
5use std::path::{Path, PathBuf};
6
/// A single indexed file: where it lives and a digest of its contents.
///
/// `Hash` is derived alongside `Eq` so values can be stored in a `HashSet`
/// or used as `HashMap` keys, e.g. when diffing two index runs.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct FileHash {
    /// Path of the file. `DefaultIndexer` stores it relative to the workspace
    /// root whenever the walked path has that root as a prefix.
    pub path: PathBuf,
    /// Hex-encoded SHA-256 digest of the file's bytes.
    pub hash: String,
}
13
/// Limits and ignore rules applied during an [`Indexer`] run.
///
/// `PartialEq`/`Eq` are derived so two configurations can be compared
/// directly (for example against `IndexerOptions::default()`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexerOptions {
    /// Files whose size in bytes is strictly greater than this are skipped.
    pub max_file_size_bytes: u64,
    /// Maximum number of files that may be indexed before the run fails with
    /// `IndexerError::FileCountLimitExceeded`.
    pub max_file_count: usize,
    /// Whether the directory walk follows symbolic links.
    pub follow_symlinks: bool,
    /// Whether git-style ignore sources (`.gitignore`, git exclude, global
    /// gitignore, `.ignore`) are honoured during the walk.
    pub respect_gitignore: bool,
    /// Name of an additional ignore file looked up during the walk, if any.
    pub custom_ignore_file: Option<String>,
}
23
24impl Default for IndexerOptions {
25 fn default() -> Self {
26 Self {
27 max_file_size_bytes: 500 * 1024,
28 max_file_count: 100000,
29 follow_symlinks: false,
30 respect_gitignore: true,
31 custom_ignore_file: Some(".codebonesignore".to_string()),
32 }
33 }
34}
35
36pub trait Indexer {
38 fn index(
40 &self,
41 workspace_root: &Path,
42 options: &IndexerOptions,
43 ) -> Result<Vec<FileHash>, IndexerError>;
44}
45
46#[derive(Debug, thiserror::Error)]
48pub enum IndexerError {
49 #[error("Path traversal detected: {0}")]
50 PathTraversal(PathBuf),
51 #[error("Symlink escape detected: {0}")]
52 SymlinkEscape(PathBuf),
53 #[error("IO error: {0}")]
54 Io(#[from] std::io::Error),
55 #[error("File count limit exceeded")]
56 FileCountLimitExceeded,
57}
58
/// Stateless, filesystem-backed [`Indexer`] implementation.
///
/// The type carries no data, so it is cheap to copy and can be created with
/// either `DefaultIndexer` or `DefaultIndexer::default()`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DefaultIndexer;
60
61impl Indexer for DefaultIndexer {
62 fn index(
63 &self,
64 workspace_root: &Path,
65 options: &IndexerOptions,
66 ) -> Result<Vec<FileHash>, IndexerError> {
67 let mut results = Vec::new();
68 let mut count = 0;
69
70 let mut builder = WalkBuilder::new(workspace_root);
71 builder.follow_links(options.follow_symlinks);
72 builder.git_ignore(options.respect_gitignore);
73 builder.git_exclude(options.respect_gitignore);
74 builder.git_global(options.respect_gitignore);
75 builder.ignore(options.respect_gitignore);
76 builder.require_git(false);
77
78 if let Some(ref custom) = options.custom_ignore_file {
79 builder.add_custom_ignore_filename(custom);
80 }
81
82 let walker = builder.build();
83
84 for result in walker {
85 let entry = match result {
86 Ok(e) => e,
87 Err(_) => continue,
88 };
89
90 let path = entry.path();
91 if path.is_dir() {
92 continue;
93 }
94
95 let canonical_root = std::fs::canonicalize(workspace_root)?;
97 let canonical_path = match std::fs::canonicalize(path) {
98 Ok(p) => p,
99 Err(_) => continue, };
101 if !canonical_path.starts_with(&canonical_root) {
102 return Err(IndexerError::PathTraversal(path.to_path_buf()));
103 }
104
105 if entry.path_is_symlink() && options.follow_symlinks {
107 if !canonical_path.starts_with(&canonical_root) {
108 return Err(IndexerError::SymlinkEscape(path.to_path_buf()));
109 }
110 } else if entry.path_is_symlink() {
111 continue; }
113
114 let file_name = path.file_name().unwrap_or_default().to_string_lossy();
116 if file_name == ".env"
117 || file_name.starts_with(".env.")
118 || file_name.ends_with(".pem")
119 || file_name.ends_with(".key")
120 || file_name.starts_with("id_rsa")
121 || file_name.starts_with("id_ed25519")
122 || file_name == "credentials.json"
123 || file_name.ends_with(".secrets")
124 || file_name.ends_with(".token")
125 || file_name == ".npmrc"
126 || file_name == ".netrc"
127 {
128 continue;
129 }
130
131 let ext = path
133 .extension()
134 .unwrap_or_default()
135 .to_string_lossy()
136 .to_lowercase();
137 if [
138 "exe", "dll", "so", "png", "jpg", "jpeg", "pdf", "db", "sqlite", "wasm",
139 ]
140 .contains(&ext.as_str())
141 {
142 continue;
143 }
144
145 let metadata = std::fs::metadata(path)?;
147 if metadata.len() > options.max_file_size_bytes {
148 continue;
149 }
150
151 let mut file = File::open(path)?;
153 let mut buffer = [0; 8192];
154 let bytes_read = file.read(&mut buffer)?;
155 if buffer[..bytes_read].contains(&0) {
156 continue;
157 }
158
159 let mut hasher = Sha256::new();
161 let mut file = File::open(path)?;
162 std::io::copy(&mut file, &mut hasher)?;
163 let hash = hex::encode(hasher.finalize());
164
165 let rel_path = path
166 .strip_prefix(workspace_root)
167 .unwrap_or(path)
168 .to_path_buf();
169
170 results.push(FileHash {
171 path: rel_path,
172 hash,
173 });
174
175 count += 1;
176 if count > options.max_file_count {
177 return Err(IndexerError::FileCountLimitExceeded);
178 }
179 }
180
181 Ok(results)
182 }
183}
184
#[cfg(test)]
mod tests {
    //! Behavioural tests for `DefaultIndexer`, each run against a fresh
    //! temporary workspace.

    use super::*;
    use std::fs::{self, File};
    use std::io::Write;
    use tempfile::TempDir;

    /// Creates an empty temporary directory to act as the workspace root.
    fn setup_workspace() -> TempDir {
        TempDir::new().unwrap()
    }

    /// A file reached through a followed symlinked *directory* is not itself
    /// a symlink, but it still resolves outside the root and must be rejected.
    // Symlink creation below is Unix-only, so the whole test is gated.
    #[cfg(unix)]
    #[test]
    fn test_rejects_path_traversal() {
        let dir = setup_workspace();
        let root = dir.path();

        let outside = TempDir::new().unwrap();
        fs::write(outside.path().join("out.txt"), "out").unwrap();
        std::os::unix::fs::symlink(outside.path(), root.join("linked_dir")).unwrap();

        let indexer = DefaultIndexer;
        let options = IndexerOptions {
            follow_symlinks: true,
            ..Default::default()
        };

        let result = indexer.index(root, &options);
        assert!(matches!(result, Err(IndexerError::PathTraversal(_))));
    }

    /// A followed symlink whose target lies outside the root aborts the run.
    /// Despite the test name, the indexer reports `PathTraversal` rather than
    /// skipping the link.
    // Gated as a whole: without the symlink the assertion below cannot hold.
    #[cfg(unix)]
    #[test]
    fn test_skips_symlinks_escaping_root() {
        let dir = setup_workspace();
        let root = dir.path();

        let out_dir = TempDir::new().unwrap();
        let out_file = out_dir.path().join("out.txt");
        fs::write(&out_file, "out").unwrap();

        let symlink_path = root.join("link");
        std::os::unix::fs::symlink(&out_file, &symlink_path).unwrap();

        let indexer = DefaultIndexer;
        let options = IndexerOptions {
            follow_symlinks: true,
            ..Default::default()
        };

        let result = indexer.index(root, &options);
        assert!(matches!(result, Err(IndexerError::PathTraversal(_))));
    }

    /// Secret-looking file names are never indexed.
    #[test]
    fn test_ignores_env_and_secret_files() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::write(root.join(".env"), "secret").unwrap();
        fs::write(root.join("id_rsa"), "secret").unwrap();
        fs::write(root.join("config.pem"), "secret").unwrap();
        fs::write(root.join("normal.txt"), "normal").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].path, PathBuf::from("normal.txt"));
    }

    /// Paths excluded by `.gitignore` do not appear in the results.
    #[test]
    fn test_ignores_gitignore() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::create_dir(root.join("ignored_dir")).unwrap();
        fs::write(root.join("ignored_dir/test.txt"), "ignored").unwrap();
        fs::write(root.join(".gitignore"), "ignored_dir/").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert!(results.iter().all(|r| !r.path.starts_with("ignored_dir")));
    }

    /// Paths excluded by the default custom ignore file do not appear in the results.
    #[test]
    fn test_ignores_codebonesignore() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::create_dir(root.join("drafts")).unwrap();
        fs::write(root.join("drafts/test.txt"), "ignored").unwrap();
        fs::write(root.join(".codebonesignore"), "drafts/").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert!(results.iter().all(|r| !r.path.starts_with("drafts")));
    }

    /// Files above `max_file_size_bytes` are skipped.
    #[test]
    fn test_skips_large_files() {
        let dir = setup_workspace();
        let root = dir.path();
        let mut file = File::create(root.join("large.txt")).unwrap();
        file.write_all(&vec![b'a'; 600 * 1024]).unwrap();

        let indexer = DefaultIndexer;
        let options = IndexerOptions {
            max_file_size_bytes: 500 * 1024,
            ..Default::default()
        };
        let results = indexer.index(root, &options).unwrap();
        assert!(results.is_empty());
    }

    /// Files with a known binary extension are skipped regardless of content.
    #[test]
    fn test_skips_binary_extension() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::write(root.join("test.exe"), "fake binary").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert!(results.is_empty());
    }

    /// A NUL byte in the leading bytes marks a file as binary.
    #[test]
    fn test_skips_binary_null_bytes() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::write(root.join("fake.txt"), b"hello\0world").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert!(results.is_empty());
    }

    /// Invalid UTF-8 without NUL bytes is still indexed; the indexer hashes
    /// raw bytes, so this only checks that the file is not dropped.
    #[test]
    fn test_replaces_invalid_utf8() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::write(root.join("invalid.txt"), b"hello\xFFworld").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert_eq!(results.len(), 1);
    }

    /// Exceeding `max_file_count` fails the run instead of truncating it.
    #[test]
    fn test_stops_at_file_count_limit() {
        let dir = setup_workspace();
        let root = dir.path();
        for i in 0..10 {
            fs::write(root.join(format!("{}.txt", i)), "test").unwrap();
        }

        let indexer = DefaultIndexer;
        let options = IndexerOptions {
            max_file_count: 5,
            ..Default::default()
        };
        let result = indexer.index(root, &options);
        assert!(matches!(result, Err(IndexerError::FileCountLimitExceeded)));
    }

    /// The digest is the hex-encoded SHA-256 of the file's bytes.
    #[test]
    fn test_generates_correct_hash() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::write(root.join("test.txt"), "hello world").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(
            results[0].hash,
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
    }
}