Skip to main content

codebones_core/
indexer.rs

1use ignore::WalkBuilder;
2use sha2::{Digest, Sha256};
3use std::fs::File;
4use std::io::Read;
5use std::path::{Path, PathBuf};
6
/// Represents a successfully indexed and hashed file.
///
/// One value is produced per file that passes every filter applied by the indexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileHash {
    /// Path of the file relative to the workspace root. If the walked path does not start
    /// with the workspace root, the walked path is stored unchanged.
    pub path: PathBuf,
    /// Hex-encoded SHA-256 hash of the file's contents.
    pub hash: String,
}
13
/// Configuration options for the indexer.
///
/// The defaults quoted on each field are the values set by the `Default` implementation.
#[derive(Debug, Clone)]
pub struct IndexerOptions {
    /// Files whose size in bytes exceeds this value are skipped. Default: 500 KiB (`500 * 1024`).
    pub max_file_size_bytes: u64,
    /// Maximum number of files that may be indexed; exceeding it fails the whole run with
    /// `IndexerError::FileCountLimitExceeded`. Default: 100000.
    pub max_file_count: usize,
    /// Forwarded to the walker's `follow_links`. When `false`, symlink entries are never
    /// indexed. Default: false.
    pub follow_symlinks: bool,
    /// Toggles the walker's `git_ignore`, `git_exclude`, `git_global` and `ignore` settings
    /// together. Default: true.
    pub respect_gitignore: bool,
    /// Name of an additional ignore file registered with the walker, e.g. ".codebonesignore"
    /// (the default). `None` registers no extra ignore file.
    pub custom_ignore_file: Option<String>,
}
23
24impl Default for IndexerOptions {
25    fn default() -> Self {
26        Self {
27            max_file_size_bytes: 500 * 1024,
28            max_file_count: 100000,
29            follow_symlinks: false,
30            respect_gitignore: true,
31            custom_ignore_file: Some(".codebonesignore".to_string()),
32        }
33    }
34}
35
/// The core indexer trait.
///
/// An implementor inspects a workspace and reports one [`FileHash`] per file it accepts.
pub trait Indexer {
    /// Indexes the given workspace path and returns a list of file hashes.
    ///
    /// * `workspace_root` - path of the workspace to index.
    /// * `options` - limits and ignore settings to apply while indexing.
    ///
    /// # Errors
    ///
    /// Returns an [`IndexerError`] when indexing cannot be completed.
    fn index(
        &self,
        workspace_root: &Path,
        options: &IndexerOptions,
    ) -> Result<Vec<FileHash>, IndexerError>;
}
45
/// Errors that can occur during indexing.
#[derive(Debug, thiserror::Error)]
pub enum IndexerError {
    /// A walked path resolved (after canonicalization) to a location outside the canonical
    /// workspace root. Carries the path as it was walked, not the canonical form.
    #[error("Path traversal detected: {0}")]
    PathTraversal(PathBuf),
    /// A symlink whose target lies outside the workspace root.
    /// NOTE(review): in `DefaultIndexer` the same condition is caught by the path traversal
    /// check first, so callers currently observe `PathTraversal` instead — confirm intent.
    #[error("Symlink escape detected: {0}")]
    SymlinkEscape(PathBuf),
    /// An underlying filesystem operation failed; converted automatically via `?`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// More files qualified for indexing than `IndexerOptions::max_file_count` allows.
    #[error("File count limit exceeded")]
    FileCountLimitExceeded,
}
58
/// Stateless [`Indexer`] implementation that walks the workspace with `ignore::WalkBuilder`
/// and hashes each accepted file with SHA-256.
pub struct DefaultIndexer;
60
61impl Indexer for DefaultIndexer {
62    fn index(
63        &self,
64        workspace_root: &Path,
65        options: &IndexerOptions,
66    ) -> Result<Vec<FileHash>, IndexerError> {
67        let mut results = Vec::new();
68        let mut count = 0;
69
70        let mut builder = WalkBuilder::new(workspace_root);
71        builder.follow_links(options.follow_symlinks);
72        builder.git_ignore(options.respect_gitignore);
73        builder.git_exclude(options.respect_gitignore);
74        builder.git_global(options.respect_gitignore);
75        builder.ignore(options.respect_gitignore);
76        builder.require_git(false);
77
78        if let Some(ref custom) = options.custom_ignore_file {
79            builder.add_custom_ignore_filename(custom);
80        }
81
82        let walker = builder.build();
83
84        for result in walker {
85            let entry = match result {
86                Ok(e) => e,
87                Err(_) => continue,
88            };
89
90            let path = entry.path();
91            if path.is_dir() {
92                continue;
93            }
94
95            // Path traversal check
96            let canonical_root = std::fs::canonicalize(workspace_root)?;
97            let canonical_path = match std::fs::canonicalize(path) {
98                Ok(p) => p,
99                Err(_) => continue, // Skip broken symlinks or missing files
100            };
101            if !canonical_path.starts_with(&canonical_root) {
102                return Err(IndexerError::PathTraversal(path.to_path_buf()));
103            }
104
105            // Symlink escape check
106            if entry.path_is_symlink() && options.follow_symlinks {
107                if !canonical_path.starts_with(&canonical_root) {
108                    return Err(IndexerError::SymlinkEscape(path.to_path_buf()));
109                }
110            } else if entry.path_is_symlink() {
111                continue; // Skip symlinks if not following
112            }
113
114            // Secret exclusion
115            let file_name = path.file_name().unwrap_or_default().to_string_lossy();
116            if file_name == ".env"
117                || file_name.starts_with(".env.")
118                || file_name.ends_with(".pem")
119                || file_name.ends_with(".key")
120                || file_name.starts_with("id_rsa")
121                || file_name.starts_with("id_ed25519")
122                || file_name == "credentials.json"
123                || file_name.ends_with(".secrets")
124                || file_name.ends_with(".token")
125                || file_name == ".npmrc"
126                || file_name == ".netrc"
127            {
128                continue;
129            }
130
131            // Binary detection (extension)
132            let ext = path
133                .extension()
134                .unwrap_or_default()
135                .to_string_lossy()
136                .to_lowercase();
137            if [
138                "exe", "dll", "so", "png", "jpg", "jpeg", "pdf", "db", "sqlite", "wasm",
139            ]
140            .contains(&ext.as_str())
141            {
142                continue;
143            }
144
145            // Size limit
146            let metadata = std::fs::metadata(path)?;
147            if metadata.len() > options.max_file_size_bytes {
148                continue;
149            }
150
151            // Binary detection (null bytes)
152            let mut file = File::open(path)?;
153            let mut buffer = [0; 8192];
154            let bytes_read = file.read(&mut buffer)?;
155            if buffer[..bytes_read].contains(&0) {
156                continue;
157            }
158
159            // Hash
160            let mut hasher = Sha256::new();
161            let mut file = File::open(path)?;
162            std::io::copy(&mut file, &mut hasher)?;
163            let hash = hex::encode(hasher.finalize());
164
165            let rel_path = path
166                .strip_prefix(workspace_root)
167                .unwrap_or(path)
168                .to_path_buf();
169
170            results.push(FileHash {
171                path: rel_path,
172                hash,
173            });
174
175            count += 1;
176            if count > options.max_file_count {
177                return Err(IndexerError::FileCountLimitExceeded);
178            }
179        }
180
181        Ok(results)
182    }
183}
184
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::{self, File};
    use std::io::Write;
    use tempfile::TempDir;

    /// Creates an empty temporary directory that serves as the workspace root.
    fn setup_workspace() -> TempDir {
        TempDir::new().unwrap()
    }

    /// A regular file reached through a followed *directory* symlink resolves outside the
    /// root and must be rejected, even though the file entry itself is not a symlink.
    // Gated as a whole: creating symlinks this way is only available on unix targets.
    #[cfg(unix)]
    #[test]
    fn test_rejects_path_traversal() {
        let dir = setup_workspace();
        let root = dir.path();

        let outside = TempDir::new().unwrap();
        fs::write(outside.path().join("inner.txt"), "outside").unwrap();
        std::os::unix::fs::symlink(outside.path(), root.join("linked_dir")).unwrap();

        let indexer = DefaultIndexer;
        let options = IndexerOptions {
            follow_symlinks: true,
            ..Default::default()
        };

        let result = indexer.index(root, &options);
        assert!(matches!(result, Err(IndexerError::PathTraversal(_))));
    }

    /// With symlink following enabled, a file symlink whose target lies outside the root
    /// aborts indexing with `PathTraversal` (despite the "skips" in the test name).
    // Gated as a whole: without the symlink there is nothing to reject and the assertion
    // below would fail on non-unix targets.
    #[cfg(unix)]
    #[test]
    fn test_skips_symlinks_escaping_root() {
        let dir = setup_workspace();
        let root = dir.path();

        let out_dir = TempDir::new().unwrap();
        let out_file = out_dir.path().join("out.txt");
        fs::write(&out_file, "out").unwrap();

        let symlink_path = root.join("link");
        std::os::unix::fs::symlink(&out_file, &symlink_path).unwrap();

        let indexer = DefaultIndexer;
        let options = IndexerOptions {
            follow_symlinks: true,
            ..Default::default()
        };

        let result = indexer.index(root, &options);
        assert!(matches!(result, Err(IndexerError::PathTraversal(_))));
    }

    /// Secret-looking files are silently excluded; ordinary files are still indexed.
    #[test]
    fn test_ignores_env_and_secret_files() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::write(root.join(".env"), "secret").unwrap();
        fs::write(root.join("id_rsa"), "secret").unwrap();
        fs::write(root.join("config.pem"), "secret").unwrap();
        fs::write(root.join("normal.txt"), "normal").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].path, PathBuf::from("normal.txt"));
    }

    /// Entries matched by `.gitignore` never appear in the results.
    #[test]
    fn test_ignores_gitignore() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::create_dir(root.join("ignored_dir")).unwrap();
        fs::write(root.join("ignored_dir/test.txt"), "ignored").unwrap();
        fs::write(root.join(".gitignore"), "ignored_dir/").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert!(results.iter().all(|r| !r.path.starts_with("ignored_dir")));
    }

    /// Entries matched by the default custom ignore file never appear in the results.
    #[test]
    fn test_ignores_codebonesignore() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::create_dir(root.join("drafts")).unwrap();
        fs::write(root.join("drafts/test.txt"), "ignored").unwrap();
        fs::write(root.join(".codebonesignore"), "drafts/").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert!(results.iter().all(|r| !r.path.starts_with("drafts")));
    }

    /// Files above `max_file_size_bytes` are skipped rather than reported as an error.
    #[test]
    fn test_skips_large_files() {
        let dir = setup_workspace();
        let root = dir.path();
        let mut file = File::create(root.join("large.txt")).unwrap();
        file.write_all(&vec![b'a'; 600 * 1024]).unwrap();

        let indexer = DefaultIndexer;
        let options = IndexerOptions {
            max_file_size_bytes: 500 * 1024,
            ..Default::default()
        };
        let results = indexer.index(root, &options).unwrap();
        assert!(results.is_empty());
    }

    /// A known binary extension excludes the file regardless of its contents.
    #[test]
    fn test_skips_binary_extension() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::write(root.join("test.exe"), "fake binary").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert!(results.is_empty());
    }

    /// A NUL byte near the start of the file marks it as binary even with a text extension.
    #[test]
    fn test_skips_binary_null_bytes() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::write(root.join("fake.txt"), b"hello\0world").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert!(results.is_empty());
    }

    /// Invalid UTF-8 does not disqualify a file: the indexer hashes raw bytes, so the file
    /// is still indexed (nothing is actually "replaced", despite the test name).
    #[test]
    fn test_replaces_invalid_utf8() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::write(root.join("invalid.txt"), b"hello\xFFworld").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert_eq!(results.len(), 1);
    }

    /// More qualifying files than `max_file_count` fails the whole run.
    #[test]
    fn test_stops_at_file_count_limit() {
        let dir = setup_workspace();
        let root = dir.path();
        for i in 0..10 {
            fs::write(root.join(format!("{}.txt", i)), "test").unwrap();
        }

        let indexer = DefaultIndexer;
        let options = IndexerOptions {
            max_file_count: 5,
            ..Default::default()
        };
        let result = indexer.index(root, &options);
        assert!(matches!(result, Err(IndexerError::FileCountLimitExceeded)));
    }

    /// The reported hash is the hex SHA-256 of the file's bytes ("hello world").
    #[test]
    fn test_generates_correct_hash() {
        let dir = setup_workspace();
        let root = dir.path();
        fs::write(root.join("test.txt"), "hello world").unwrap();

        let indexer = DefaultIndexer;
        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(
            results[0].hash,
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
    }
}