// codebones_core/indexer.rs

1use ignore::WalkBuilder;
2use sha2::{Digest, Sha256};
3use std::fs::File;
4use std::io::Read;
5use std::path::{Path, PathBuf};
6
/// Represents a successfully indexed and hashed file.
///
/// Values compare equal only when both the path and the hash match.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileHash {
    /// Path of the file, relative to the workspace root.
    pub path: PathBuf,
    /// Hex-encoded SHA-256 hash.
    pub hash: String,
}
13
/// Configuration options for the indexer.
///
/// The default quoted on each field is the value produced by the
/// [`Default`] implementation directly below this struct.
#[derive(Debug, Clone)]
pub struct IndexerOptions {
    /// Size threshold in bytes; files strictly larger than this are skipped.
    /// Default: 500 KiB (`500 * 1024`).
    pub max_file_size_bytes: u64,
    /// File-count limit; indexing aborts with
    /// `IndexerError::FileCountLimitExceeded` once it is hit.
    /// Default: `100_000`.
    pub max_file_count: usize,
    /// Whether the directory walker follows symbolic links. Default: `false`.
    pub follow_symlinks: bool,
    /// Whether git-style ignore rules are honoured by the walker.
    /// Default: `true`.
    pub respect_gitignore: bool,
    /// Name of an additional ignore file handed to the walker.
    /// Default: `Some(".codebonesignore")`.
    pub custom_ignore_file: Option<String>,
}

impl Default for IndexerOptions {
    /// Builds the default configuration documented on each field.
    fn default() -> Self {
        Self {
            max_file_size_bytes: 500 * 1024,
            max_file_count: 100_000,
            follow_symlinks: false,
            respect_gitignore: true,
            custom_ignore_file: Some(".codebonesignore".to_string()),
        }
    }
}
35
/// The core indexer trait.
///
/// Implementors turn a workspace directory into a list of [`FileHash`]
/// entries according to the supplied [`IndexerOptions`].
pub trait Indexer {
    /// Indexes the given workspace path and returns a list of file hashes.
    ///
    /// # Errors
    ///
    /// Returns an [`IndexerError`] when indexing cannot be completed.
    fn index(
        &self,
        workspace_root: &Path,
        options: &IndexerOptions,
    ) -> Result<Vec<FileHash>, IndexerError>;
}
45
/// Errors that can occur during indexing.
#[derive(Debug, thiserror::Error)]
pub enum IndexerError {
    /// A walked path resolved to a location outside the workspace root.
    /// Carries the offending path.
    #[error("Path traversal detected: {0}")]
    PathTraversal(PathBuf),
    /// A symlink pointing outside the workspace root.
    /// NOTE(review): no visible code constructs this variant; escaping
    /// symlinks are currently reported as `PathTraversal` — confirm intent.
    #[error("Symlink escape detected: {0}")]
    SymlinkEscape(PathBuf),
    /// An underlying I/O failure; `#[from]` lets `?` convert
    /// `std::io::Error` into this variant.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// The configured `max_file_count` limit was hit.
    #[error("File count limit exceeded")]
    FileCountLimitExceeded,
}
58
59pub struct DefaultIndexer;
60
61impl Indexer for DefaultIndexer {
62    fn index(
63        &self,
64        workspace_root: &Path,
65        options: &IndexerOptions,
66    ) -> Result<Vec<FileHash>, IndexerError> {
67        let mut results = Vec::new();
68        let mut count = 0;
69
70        let mut builder = WalkBuilder::new(workspace_root);
71        builder.follow_links(options.follow_symlinks);
72        builder.git_ignore(options.respect_gitignore);
73        builder.git_exclude(options.respect_gitignore);
74        builder.git_global(options.respect_gitignore);
75        builder.ignore(options.respect_gitignore);
76        builder.require_git(false);
77
78        if let Some(ref custom) = options.custom_ignore_file {
79            builder.add_custom_ignore_filename(custom);
80        }
81
82        let walker = builder.build();
83
84        let canonical_root = std::fs::canonicalize(workspace_root)?;
85
86        for result in walker {
87            let entry = match result {
88                Ok(e) => e,
89                Err(_) => continue,
90            };
91
92            let path = entry.path();
93            if path.is_dir() {
94                continue;
95            }
96
97            // Path traversal check
98            let canonical_path = match std::fs::canonicalize(path) {
99                Ok(p) => p,
100                Err(_) => continue, // Skip broken symlinks or missing files
101            };
102            if !canonical_path.starts_with(&canonical_root) {
103                return Err(IndexerError::PathTraversal(path.to_path_buf()));
104            }
105
106            // Symlink policy:
107            //   - follow_symlinks=false (default): skip symlinks silently.
108            //   - follow_symlinks=true: symlinks that escape the root are already
109            //     rejected by the PathTraversal check above; symlinks inside the
110            //     root are allowed through.
111            if entry.path_is_symlink() && !options.follow_symlinks {
112                continue; // Skip symlinks when not following
113            }
114
115            // Secret exclusion
116            let file_name = path.file_name().unwrap_or_default().to_string_lossy();
117            if file_name == ".env"
118                || file_name.starts_with(".env.")
119                || file_name == ".envrc"
120                || file_name.ends_with(".pem")
121                || file_name.ends_with(".key")
122                || file_name.ends_with(".tfvars")
123                || file_name.ends_with(".p12")
124                || file_name.ends_with(".pfx")
125                || file_name.ends_with(".jks")
126                || file_name.starts_with("id_rsa")
127                || file_name.starts_with("id_ed25519")
128                || file_name == "id_ecdsa"
129                || file_name == "id_dsa"
130                || file_name == "id_ecdsa_sk"
131                || file_name == "id_xmss"
132                || file_name == "credentials.json"
133                || file_name.ends_with(".secrets")
134                || file_name.ends_with(".token")
135                || file_name == ".npmrc"
136                || file_name == ".netrc"
137            {
138                continue;
139            }
140
141            // Binary detection (extension)
142            let ext = path
143                .extension()
144                .unwrap_or_default()
145                .to_string_lossy()
146                .to_lowercase();
147            if [
148                "exe", "dll", "so", "png", "jpg", "jpeg", "pdf", "db", "sqlite", "wasm",
149            ]
150            .contains(&ext.as_str())
151            {
152                continue;
153            }
154
155            // Size limit
156            let metadata = match std::fs::metadata(path) {
157                Ok(metadata) => metadata,
158                Err(error) if error.kind() == std::io::ErrorKind::PermissionDenied => continue,
159                Err(error) => return Err(error.into()),
160            };
161            if metadata.len() > options.max_file_size_bytes {
162                continue;
163            }
164
165            // Binary detection (null bytes) and PEM credential detection
166            let mut file = match File::open(path) {
167                Ok(file) => file,
168                Err(error) if error.kind() == std::io::ErrorKind::PermissionDenied => continue,
169                Err(error) => return Err(error.into()),
170            };
171            let mut buffer = [0; 8192];
172            let bytes_read = match file.read(&mut buffer) {
173                Ok(bytes_read) => bytes_read,
174                Err(error) if error.kind() == std::io::ErrorKind::PermissionDenied => continue,
175                Err(error) => return Err(error.into()),
176            };
177            let chunk = &buffer[..bytes_read];
178            if chunk.contains(&0) {
179                continue;
180            }
181            // Skip PEM-encoded credential files (private keys, certificates, etc.)
182            if chunk.windows(11).any(|w| w == b"-----BEGIN ") {
183                continue;
184            }
185
186            // Hash
187            let mut hasher = Sha256::new();
188            let mut file = match File::open(path) {
189                Ok(file) => file,
190                Err(error) if error.kind() == std::io::ErrorKind::PermissionDenied => continue,
191                Err(error) => return Err(error.into()),
192            };
193            match std::io::copy(&mut file, &mut hasher) {
194                Ok(_) => {}
195                Err(error) if error.kind() == std::io::ErrorKind::PermissionDenied => continue,
196                Err(error) => return Err(error.into()),
197            }
198            let hash = hex::encode(hasher.finalize());
199
200            let rel_path = path
201                .strip_prefix(workspace_root)
202                .unwrap_or(path)
203                .to_path_buf();
204
205            results.push(FileHash {
206                path: rel_path,
207                hash,
208            });
209
210            count += 1;
211            if count >= options.max_file_count {
212                return Err(IndexerError::FileCountLimitExceeded);
213            }
214        }
215
216        Ok(results)
217    }
218}
219
220#[cfg(test)]
221mod tests {
222    use super::*;
223    use std::fs::{self, File};
224    use std::io::Write;
225    use tempfile::TempDir;
226
227    fn setup_workspace() -> TempDir {
228        TempDir::new().unwrap()
229    }
230
231    #[test]
232    fn test_skips_symlinks_escaping_root() {
233        let dir = setup_workspace();
234        let root = dir.path();
235
236        let out_dir = TempDir::new().unwrap();
237        let out_file = out_dir.path().join("out.txt");
238        fs::write(&out_file, "out").unwrap();
239
240        let symlink_path = root.join("link");
241        #[cfg(unix)]
242        std::os::unix::fs::symlink(&out_file, &symlink_path).unwrap();
243
244        let indexer = DefaultIndexer;
245        let options = IndexerOptions {
246            follow_symlinks: true,
247            ..Default::default()
248        };
249
250        let result = indexer.index(root, &options);
251        assert!(matches!(result, Err(IndexerError::PathTraversal(_))));
252    }
253
254    #[test]
255    fn test_ignores_env_and_secret_files() {
256        let dir = setup_workspace();
257        let root = dir.path();
258        fs::write(root.join(".env"), "secret").unwrap();
259        fs::write(root.join("id_rsa"), "secret").unwrap();
260        fs::write(root.join("config.pem"), "secret").unwrap();
261        fs::write(root.join("normal.txt"), "normal").unwrap();
262
263        let indexer = DefaultIndexer;
264        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
265        assert_eq!(results.len(), 1);
266        assert_eq!(results[0].path, PathBuf::from("normal.txt"));
267    }
268
269    #[test]
270    fn test_ignores_gitignore() {
271        let dir = setup_workspace();
272        let root = dir.path();
273        fs::create_dir(root.join("ignored_dir")).unwrap();
274        fs::write(root.join("ignored_dir/test.txt"), "ignored").unwrap();
275        fs::write(root.join(".gitignore"), "ignored_dir/").unwrap();
276
277        let indexer = DefaultIndexer;
278        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
279        assert!(results.iter().all(|r| !r.path.starts_with("ignored_dir")));
280    }
281
282    #[test]
283    fn test_ignores_codebonesignore() {
284        let dir = setup_workspace();
285        let root = dir.path();
286        fs::create_dir(root.join("drafts")).unwrap();
287        fs::write(root.join("drafts/test.txt"), "ignored").unwrap();
288        fs::write(root.join(".codebonesignore"), "drafts/").unwrap();
289
290        let indexer = DefaultIndexer;
291        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
292        assert!(results.iter().all(|r| !r.path.starts_with("drafts")));
293    }
294
295    #[test]
296    fn test_skips_large_files() {
297        let dir = setup_workspace();
298        let root = dir.path();
299        let mut file = File::create(root.join("large.txt")).unwrap();
300        file.write_all(&vec![b'a'; 600 * 1024]).unwrap();
301
302        let indexer = DefaultIndexer;
303        let options = IndexerOptions {
304            max_file_size_bytes: 500 * 1024,
305            ..Default::default()
306        };
307        let results = indexer.index(root, &options).unwrap();
308        assert!(results.is_empty());
309    }
310
311    #[test]
312    fn test_skips_binary_extension() {
313        let dir = setup_workspace();
314        let root = dir.path();
315        fs::write(root.join("test.exe"), "fake binary").unwrap();
316
317        let indexer = DefaultIndexer;
318        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
319        assert!(results.is_empty());
320    }
321
322    #[test]
323    fn test_skips_binary_null_bytes() {
324        let dir = setup_workspace();
325        let root = dir.path();
326        fs::write(root.join("fake.txt"), b"hello\0world").unwrap();
327
328        let indexer = DefaultIndexer;
329        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
330        assert!(results.is_empty());
331    }
332
333    #[test]
334    fn test_replaces_invalid_utf8() {
335        let dir = setup_workspace();
336        let root = dir.path();
337        fs::write(root.join("invalid.txt"), b"hello\xFFworld").unwrap();
338
339        let indexer = DefaultIndexer;
340        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
341        assert_eq!(results.len(), 1);
342    }
343
344    #[test]
345    fn test_stops_at_file_count_limit() {
346        let dir = setup_workspace();
347        let root = dir.path();
348        for i in 0..10 {
349            fs::write(root.join(format!("{}.txt", i)), "test").unwrap();
350        }
351
352        let indexer = DefaultIndexer;
353        let options = IndexerOptions {
354            max_file_count: 5,
355            ..Default::default()
356        };
357        let result = indexer.index(root, &options);
358        assert!(matches!(result, Err(IndexerError::FileCountLimitExceeded)));
359    }
360
361    #[test]
362    fn test_generates_correct_hash() {
363        let dir = setup_workspace();
364        let root = dir.path();
365        fs::write(root.join("test.txt"), "hello world").unwrap();
366
367        let indexer = DefaultIndexer;
368        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
369        assert_eq!(results.len(), 1);
370        assert_eq!(
371            results[0].hash,
372            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
373        );
374    }
375
376    // --- Secret file exclusion ---
377
378    #[test]
379    fn test_excludes_dotenv_file() {
380        let dir = setup_workspace();
381        let root = dir.path();
382        fs::write(root.join(".env"), "SECRET=hunter2").unwrap();
383
384        let indexer = DefaultIndexer;
385        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
386        let names: Vec<_> = results
387            .iter()
388            .map(|r| r.path.to_string_lossy().to_string())
389            .collect();
390        assert!(
391            !names.iter().any(|n| n == ".env"),
392            ".env must be excluded, got: {:?}",
393            names
394        );
395    }
396
397    #[test]
398    fn test_excludes_id_rsa_file() {
399        let dir = setup_workspace();
400        let root = dir.path();
401        fs::write(root.join("id_rsa"), "-----BEGIN RSA PRIVATE KEY-----").unwrap();
402
403        let indexer = DefaultIndexer;
404        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
405        let names: Vec<_> = results
406            .iter()
407            .map(|r| r.path.to_string_lossy().to_string())
408            .collect();
409        assert!(
410            !names.iter().any(|n| n == "id_rsa"),
411            "id_rsa must be excluded, got: {:?}",
412            names
413        );
414    }
415
416    #[test]
417    fn test_excludes_credentials_json_file() {
418        let dir = setup_workspace();
419        let root = dir.path();
420        fs::write(root.join("credentials.json"), r#"{"token":"secret"}"#).unwrap();
421
422        let indexer = DefaultIndexer;
423        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
424        let names: Vec<_> = results
425            .iter()
426            .map(|r| r.path.to_string_lossy().to_string())
427            .collect();
428        assert!(
429            !names.iter().any(|n| n == "credentials.json"),
430            "credentials.json must be excluded, got: {:?}",
431            names
432        );
433    }
434
435    #[test]
436    fn test_excludes_pem_header_file() {
437        // Any file whose content begins with a PEM header must be treated as a
438        // credential and excluded, regardless of its filename or extension.
439        let dir = setup_workspace();
440        let root = dir.path();
441        // Use an unusual extension to confirm the check is content-based, not name-based.
442        fs::write(
443            root.join("server.crt"),
444            "-----BEGIN RSA PRIVATE KEY-----\nMIIEowIBAAKCAQEA...\n-----END RSA PRIVATE KEY-----\n",
445        )
446        .unwrap();
447        fs::write(root.join("normal.txt"), "just text").unwrap();
448
449        let indexer = DefaultIndexer;
450        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
451        let names: Vec<_> = results
452            .iter()
453            .map(|r| r.path.to_string_lossy().to_string())
454            .collect();
455        assert!(
456            !names.iter().any(|n| n == "server.crt"),
457            "File with PEM header must be excluded, got: {:?}",
458            names
459        );
460        assert!(
461            names.iter().any(|n| n == "normal.txt"),
462            "normal.txt must still be indexed, got: {:?}",
463            names
464        );
465    }
466
467    #[test]
468    fn test_normal_rs_file_is_not_excluded() {
469        let dir = setup_workspace();
470        let root = dir.path();
471        fs::write(
472            root.join("lib.rs"),
473            "pub fn add(a: i32, b: i32) -> i32 { a + b }",
474        )
475        .unwrap();
476
477        let indexer = DefaultIndexer;
478        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
479        let names: Vec<_> = results
480            .iter()
481            .map(|r| r.path.to_string_lossy().to_string())
482            .collect();
483        assert!(
484            names.iter().any(|n| n == "lib.rs"),
485            "lib.rs must be indexed, got: {:?}",
486            names
487        );
488    }
489
490    // --- Binary file exclusion ---
491
492    #[test]
493    fn test_excludes_exe_extension() {
494        let dir = setup_workspace();
495        let root = dir.path();
496        fs::write(root.join("app.exe"), "MZ fake windows binary").unwrap();
497
498        let indexer = DefaultIndexer;
499        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
500        let names: Vec<_> = results
501            .iter()
502            .map(|r| r.path.to_string_lossy().to_string())
503            .collect();
504        assert!(
505            !names.iter().any(|n| n.ends_with(".exe")),
506            ".exe must be excluded, got: {:?}",
507            names
508        );
509    }
510
511    #[test]
512    fn test_excludes_png_extension() {
513        let dir = setup_workspace();
514        let root = dir.path();
515        // PNG magic bytes header to make it realistic, but content doesn't matter
516        fs::write(root.join("logo.png"), b"\x89PNG\r\n\x1a\nfake image data").unwrap();
517
518        let indexer = DefaultIndexer;
519        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
520        let names: Vec<_> = results
521            .iter()
522            .map(|r| r.path.to_string_lossy().to_string())
523            .collect();
524        assert!(
525            !names.iter().any(|n| n.ends_with(".png")),
526            ".png must be excluded, got: {:?}",
527            names
528        );
529    }
530
531    #[test]
532    fn test_excludes_source_file_with_null_bytes() {
533        // A file with a .rs extension but containing null bytes should be treated
534        // as binary and skipped. This catches embedded binaries misnamed as source.
535        let dir = setup_workspace();
536        let root = dir.path();
537        let mut content = b"fn main() { println!(\"hello\"); }\n".to_vec();
538        content.push(0x00); // inject a null byte
539        content.extend_from_slice(b" // more code");
540        fs::write(root.join("tricky.rs"), &content).unwrap();
541
542        let indexer = DefaultIndexer;
543        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
544        let names: Vec<_> = results
545            .iter()
546            .map(|r| r.path.to_string_lossy().to_string())
547            .collect();
548        assert!(
549            !names.iter().any(|n| n == "tricky.rs"),
550            "Source file with null bytes must be excluded, got: {:?}",
551            names
552        );
553    }
554
555    // --- Glob filtering via ignore file ---
556
557    #[test]
558    fn test_codebonesignore_glob_excludes_toml_files() {
559        // Simulate "--ignore *.toml" by writing a .codebonesignore with a glob pattern
560        let dir = setup_workspace();
561        let root = dir.path();
562        fs::write(root.join("Cargo.toml"), "[package]\nname = \"test\"").unwrap();
563        fs::write(root.join("main.rs"), "fn main() {}").unwrap();
564        fs::write(root.join(".codebonesignore"), "*.toml\n").unwrap();
565
566        let indexer = DefaultIndexer;
567        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
568        let names: Vec<_> = results
569            .iter()
570            .map(|r| r.path.to_string_lossy().to_string())
571            .collect();
572
573        assert!(
574            !names.iter().any(|n| n.ends_with(".toml")),
575            "*.toml files must be excluded via .codebonesignore, got: {:?}",
576            names
577        );
578        assert!(
579            names.iter().any(|n| n == "main.rs"),
580            "main.rs must still be indexed, got: {:?}",
581            names
582        );
583    }
584
585    #[test]
586    fn test_gitignore_glob_excludes_matching_files() {
587        // Simulate "--ignore *.log" by writing a .gitignore with a glob pattern
588        let dir = setup_workspace();
589        let root = dir.path();
590        fs::write(root.join("app.log"), "INFO: server started").unwrap();
591        fs::write(root.join("server.rs"), "fn serve() {}").unwrap();
592        fs::write(root.join(".gitignore"), "*.log\n").unwrap();
593
594        let indexer = DefaultIndexer;
595        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
596        let names: Vec<_> = results
597            .iter()
598            .map(|r| r.path.to_string_lossy().to_string())
599            .collect();
600
601        assert!(
602            !names.iter().any(|n| n.ends_with(".log")),
603            "*.log files must be excluded via .gitignore, got: {:?}",
604            names
605        );
606    }
607
608    #[test]
609    fn test_only_rs_files_indexed_when_all_others_ignored() {
610        // Simulate "--include *.rs only" by ignoring everything else via .codebonesignore
611        let dir = setup_workspace();
612        let root = dir.path();
613        fs::write(root.join("main.rs"), "fn main() {}").unwrap();
614        fs::write(root.join("readme.md"), "# Project").unwrap();
615        fs::write(root.join("config.yaml"), "key: value").unwrap();
616        // Use .codebonesignore to exclude non-Rust files
617        fs::write(root.join(".codebonesignore"), "*.md\n*.yaml\n").unwrap();
618
619        let indexer = DefaultIndexer;
620        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
621        let names: Vec<_> = results
622            .iter()
623            .map(|r| r.path.to_string_lossy().to_string())
624            .collect();
625
626        for name in &names {
627            assert!(
628                name.ends_with(".rs"),
629                "Only .rs files should be indexed, but found: {}",
630                name
631            );
632        }
633        assert!(
634            names.iter().any(|n| n == "main.rs"),
635            "main.rs must be in results"
636        );
637    }
638
639    // --- Path traversal security test ---
640
641    #[test]
642    fn test_path_traversal_outside_root_is_rejected_or_absent() {
643        // Create a workspace root and a separate directory outside it.
644        // Attempt to index a path that canonically lives outside the root.
645        // The indexer must either return an error or produce no results
646        // referencing paths outside the workspace root.
647        let workspace = TempDir::new().unwrap();
648        let outside = TempDir::new().unwrap();
649
650        // Write a file in the workspace
651        fs::write(workspace.path().join("inside.txt"), "safe content").unwrap();
652
653        // Write a file outside the workspace
654        fs::write(outside.path().join("outside.txt"), "secret content").unwrap();
655
656        // Attempt to index using a symlink that escapes the workspace root
657        // (only possible on Unix; on Windows the symlink call is a no-op and we
658        // just verify the walker doesn't traverse outside on its own)
659        #[cfg(unix)]
660        {
661            let link_path = workspace.path().join("escape_link");
662            std::os::unix::fs::symlink(outside.path().join("outside.txt"), &link_path).unwrap();
663
664            let indexer = DefaultIndexer;
665            // With follow_symlinks=false (default) the symlink is either skipped
666            // (Ok with no escaping entry) or rejected outright (Err PathTraversal).
667            // Both are correct — the escaping file must never appear in results.
668            let result = indexer.index(workspace.path(), &IndexerOptions::default());
669            let files = match result {
670                Ok(f) => f,
671                Err(IndexerError::PathTraversal(_)) | Err(IndexerError::SymlinkEscape(_)) => {
672                    vec![] // rejected at the gate — correct behaviour
673                }
674                Err(e) => panic!("Unexpected error with follow_symlinks=false: {}", e),
675            };
676
677            let outside_root = outside.path();
678            for fh in &files {
679                let absolute = workspace.path().join(&fh.path);
680                assert!(
681                    absolute.starts_with(workspace.path()),
682                    "Traversal detected: {:?} is outside {:?}",
683                    absolute,
684                    workspace.path()
685                );
686                assert_ne!(
687                    fh.path.to_string_lossy().as_ref(),
688                    "escape_link",
689                    "Symlink pointing outside root must not be indexed"
690                );
691                let _ = outside_root;
692            }
693        }
694
695        // When follow_symlinks=true, the indexer is expected to return an error
696        // for paths that escape the workspace root.
697        #[cfg(unix)]
698        {
699            let link_path2 = workspace.path().join("escape_link2");
700            // Only create if it doesn't already exist (test may run twice in parallel)
701            if !link_path2.exists() {
702                std::os::unix::fs::symlink(outside.path().join("outside.txt"), &link_path2)
703                    .unwrap();
704            }
705            let indexer = DefaultIndexer;
706            let options = IndexerOptions {
707                follow_symlinks: true,
708                ..Default::default()
709            };
710            let result = indexer.index(workspace.path(), &options);
711            // Must either be an error (PathTraversal/SymlinkEscape) or not include
712            // files that canonically live outside the workspace.
713            match result {
714                Err(IndexerError::PathTraversal(_)) | Err(IndexerError::SymlinkEscape(_)) => {
715                    // Correct: traversal detected and rejected
716                }
717                Ok(files) => {
718                    for fh in &files {
719                        let absolute = workspace.path().join(&fh.path);
720                        assert!(
721                            absolute.starts_with(workspace.path()),
722                            "Returned file escapes workspace: {:?}",
723                            absolute
724                        );
725                    }
726                }
727                Err(other) => panic!("Unexpected error: {}", other),
728            }
729        }
730    }
731
732    // --- Incremental indexing ---
733
734    #[test]
735    fn test_large_file_at_limit_is_indexed_small_file_over_limit_is_skipped() {
736        // The indexer uses `metadata.len() > max_file_size_bytes` (strict greater-than),
737        // so a file of exactly max_file_size_bytes is INCLUDED; one of max+1 is EXCLUDED.
738        let dir = setup_workspace();
739        let root = dir.path();
740
741        let max_size: u64 = 500 * 1024; // 512_000 bytes — the default limit
742
743        // File exactly AT the limit — should be indexed (not greater-than, so passes the check)
744        let at_limit_path = root.join("at_limit.txt");
745        let mut at_limit = File::create(&at_limit_path).unwrap();
746        at_limit.write_all(&vec![b'a'; max_size as usize]).unwrap();
747
748        // File one byte OVER the limit — should be skipped
749        let over_limit_path = root.join("over_limit.txt");
750        let mut over_limit = File::create(&over_limit_path).unwrap();
751        over_limit
752            .write_all(&vec![b'b'; max_size as usize + 1])
753            .unwrap();
754
755        let indexer = DefaultIndexer;
756        let options = IndexerOptions {
757            max_file_size_bytes: max_size,
758            respect_gitignore: false,
759            ..Default::default()
760        };
761
762        let results = indexer.index(root, &options).unwrap();
763        let names: Vec<String> = results
764            .iter()
765            .map(|r| r.path.to_string_lossy().to_string())
766            .collect();
767
768        assert!(
769            names.iter().any(|n| n == "at_limit.txt"),
770            "File of exactly max_file_size_bytes should be indexed (boundary is exclusive); got: {:?}",
771            names
772        );
773
774        assert!(
775            !names.iter().any(|n| n == "over_limit.txt"),
776            "File of max_file_size_bytes + 1 should NOT be indexed; got: {:?}",
777            names
778        );
779    }
780
781    #[test]
782    fn test_incremental_indexing_only_changed_file_has_new_hash() {
783        use std::collections::HashMap;
784
785        let dir = setup_workspace();
786        let root = dir.path();
787
788        // Write two files
789        fs::write(root.join("stable.rs"), "fn stable() {}").unwrap();
790        fs::write(root.join("volatile.rs"), "fn original() {}").unwrap();
791
792        let indexer = DefaultIndexer;
793        let options = IndexerOptions {
794            respect_gitignore: false,
795            ..Default::default()
796        };
797
798        // First index pass: record all hashes
799        let first_results = indexer.index(root, &options).unwrap();
800        let first_hashes: HashMap<String, String> = first_results
801            .iter()
802            .map(|fh| (fh.path.to_string_lossy().to_string(), fh.hash.clone()))
803            .collect();
804
805        assert!(
806            first_hashes.contains_key("stable.rs"),
807            "stable.rs must be in first index"
808        );
809        assert!(
810            first_hashes.contains_key("volatile.rs"),
811            "volatile.rs must be in first index"
812        );
813
814        // Modify only volatile.rs
815        fs::write(root.join("volatile.rs"), "fn modified() {}").unwrap();
816
817        // Second index pass
818        let second_results = indexer.index(root, &options).unwrap();
819        let second_hashes: HashMap<String, String> = second_results
820            .iter()
821            .map(|fh| (fh.path.to_string_lossy().to_string(), fh.hash.clone()))
822            .collect();
823
824        // stable.rs hash must be unchanged
825        assert_eq!(
826            first_hashes["stable.rs"], second_hashes["stable.rs"],
827            "stable.rs hash must not change between index passes"
828        );
829
830        // volatile.rs hash must have changed
831        assert_ne!(
832            first_hashes["volatile.rs"], second_hashes["volatile.rs"],
833            "volatile.rs hash must change after file modification"
834        );
835    }
836
837    // --- New credential exclusion tests (Gap 3) ---
838
839    #[test]
840    fn test_excludes_id_ecdsa_file() {
841        let dir = setup_workspace();
842        let root = dir.path();
843        fs::write(root.join("id_ecdsa"), "-----BEGIN EC PRIVATE KEY-----").unwrap();
844
845        let indexer = DefaultIndexer;
846        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
847        assert!(
848            results.is_empty(),
849            "id_ecdsa must be excluded, got: {:?}",
850            results
851        );
852    }
853
854    #[test]
855    fn test_excludes_tfvars_file() {
856        let dir = setup_workspace();
857        let root = dir.path();
858        fs::write(root.join("terraform.tfvars"), "db_password = \"secret\"").unwrap();
859
860        let indexer = DefaultIndexer;
861        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
862        assert!(
863            results.is_empty(),
864            "terraform.tfvars must be excluded, got: {:?}",
865            results
866        );
867    }
868
869    #[test]
870    fn test_excludes_p12_file() {
871        let dir = setup_workspace();
872        let root = dir.path();
873        fs::write(root.join("keystore.p12"), b"fake pkcs12 binary bytes").unwrap();
874
875        let indexer = DefaultIndexer;
876        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
877        assert!(
878            results.is_empty(),
879            "keystore.p12 must be excluded, got: {:?}",
880            results
881        );
882    }
883
884    #[test]
885    fn test_indexes_crt_file_without_pem_header() {
886        let dir = setup_workspace();
887        let root = dir.path();
888        fs::write(root.join("cert.crt"), "CERTIFICATE DATA WITHOUT PEM HEADER").unwrap();
889
890        let indexer = DefaultIndexer;
891        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
892        let names: Vec<_> = results
893            .iter()
894            .map(|r| r.path.to_string_lossy().to_string())
895            .collect();
896        assert!(
897            names.iter().any(|n| n == "cert.crt"),
898            "cert.crt without a PEM header must be indexed, got: {:?}",
899            names
900        );
901    }
902
903    #[test]
904    fn test_excludes_crt_file_with_pem_header() {
905        let dir = setup_workspace();
906        let root = dir.path();
907        fs::write(
908            root.join("cert.crt"),
909            "-----BEGIN CERTIFICATE-----\nMIIB...",
910        )
911        .unwrap();
912
913        let indexer = DefaultIndexer;
914        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
915        let names: Vec<_> = results
916            .iter()
917            .map(|r| r.path.to_string_lossy().to_string())
918            .collect();
919        assert!(
920            !names.iter().any(|n| n == "cert.crt"),
921            "cert.crt with a PEM header must be excluded, got: {:?}",
922            names
923        );
924    }
925
926    // --- Symlink within root behavior test (Gap 4) ---
927
928    #[test]
929    #[cfg(unix)]
930    fn test_symlink_within_root_indexed_with_follow_symlinks() {
931        // Documents the current behavior: with follow_symlinks=true, symlinks that
932        // resolve within the root are allowed through and indexed. Only symlinks that
933        // escape the root produce a PathTraversal error.
934        let dir = setup_workspace();
935        let root = dir.path();
936
937        // Create a real file inside the root.
938        let real_file = root.join("real.rs");
939        fs::write(&real_file, "fn real() {}").unwrap();
940
941        // Create a symlink inside the root that points to the real file (still within root).
942        let symlink_path = root.join("link_to_real.rs");
943        std::os::unix::fs::symlink(&real_file, &symlink_path).unwrap();
944
945        let indexer = DefaultIndexer;
946        let options = IndexerOptions {
947            follow_symlinks: true,
948            ..Default::default()
949        };
950
951        // With follow_symlinks=true, a within-root symlink must be indexed successfully.
952        let result = indexer.index(root, &options);
953        assert!(
954            result.is_ok(),
955            "with follow_symlinks=true, a symlink inside the root must be indexed (not errored); got: {:?}",
956            result
957        );
958        let files = result.unwrap();
959        let names: Vec<_> = files
960            .iter()
961            .map(|r| r.path.to_string_lossy().to_string())
962            .collect();
963        assert!(
964            names.iter().any(|n| n == "link_to_real.rs"),
965            "the within-root symlink must appear in indexed results; got: {:?}",
966            names
967        );
968    }
969}