Skip to main content

codebones_core/
indexer.rs

1use ignore::WalkBuilder;
2use sha2::{Digest, Sha256};
3use std::fs::File;
4use std::io::Read;
5use std::path::{Path, PathBuf};
6
/// A file that was indexed successfully, paired with the digest of its contents.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct FileHash {
    /// Location of the file, relative to the workspace root.
    pub path: PathBuf,
    /// SHA-256 digest of the file contents, hex-encoded.
    pub hash: String,
}
13
/// Configuration options for the indexer.
///
/// The "Default" noted on each field is the value produced by
/// `IndexerOptions::default()`.
#[derive(Debug, Clone)]
pub struct IndexerOptions {
    /// Files larger than this many bytes are skipped. Default: 500 KiB (`500 * 1024`).
    pub max_file_size_bytes: u64,
    /// Cap on the number of indexed files, enforced via
    /// `IndexerError::FileCountLimitExceeded`. Default: 100_000.
    pub max_file_count: usize,
    /// Whether the directory walker follows symbolic links. Default: `false`.
    pub follow_symlinks: bool,
    /// Whether git-style ignore rules are applied while walking. Default: `true`.
    pub respect_gitignore: bool,
    /// Name of an additional ignore file to honour while walking.
    /// Default: `Some(".codebonesignore")`.
    pub custom_ignore_file: Option<String>,
}
23
24impl Default for IndexerOptions {
25    fn default() -> Self {
26        Self {
27            max_file_size_bytes: 500 * 1024,
28            max_file_count: 100000,
29            follow_symlinks: false,
30            respect_gitignore: true,
31            custom_ignore_file: Some(".codebonesignore".to_string()),
32        }
33    }
34}
35
36/// The core indexer trait.
37pub trait Indexer {
38    /// Indexes the given workspace path and returns a list of file hashes.
39    fn index(
40        &self,
41        workspace_root: &Path,
42        options: &IndexerOptions,
43    ) -> Result<Vec<FileHash>, IndexerError>;
44}
45
46/// Errors that can occur during indexing.
47#[derive(Debug, thiserror::Error)]
48pub enum IndexerError {
49    #[error("Path traversal detected: {0}")]
50    PathTraversal(PathBuf),
51    #[error("Symlink escape detected: {0}")]
52    SymlinkEscape(PathBuf),
53    #[error("IO error: {0}")]
54    Io(#[from] std::io::Error),
55    #[error("File count limit exceeded")]
56    FileCountLimitExceeded,
57}
58
/// Stateless, built-in implementation of the [`Indexer`] trait.
pub struct DefaultIndexer;
60
61impl Indexer for DefaultIndexer {
62    fn index(
63        &self,
64        workspace_root: &Path,
65        options: &IndexerOptions,
66    ) -> Result<Vec<FileHash>, IndexerError> {
67        let mut results = Vec::new();
68        let mut count = 0;
69
70        let mut builder = WalkBuilder::new(workspace_root);
71        builder.follow_links(options.follow_symlinks);
72        builder.git_ignore(options.respect_gitignore);
73        builder.git_exclude(options.respect_gitignore);
74        builder.git_global(options.respect_gitignore);
75        builder.ignore(options.respect_gitignore);
76        builder.require_git(false);
77
78        if let Some(ref custom) = options.custom_ignore_file {
79            builder.add_custom_ignore_filename(custom);
80        }
81
82        let walker = builder.build();
83
84        let canonical_root = std::fs::canonicalize(workspace_root)?;
85
86        for result in walker {
87            let entry = match result {
88                Ok(e) => e,
89                Err(_) => continue,
90            };
91
92            let path = entry.path();
93            if path.is_dir() {
94                continue;
95            }
96
97            // Path traversal check
98            let canonical_path = match std::fs::canonicalize(path) {
99                Ok(p) => p,
100                Err(_) => continue, // Skip broken symlinks or missing files
101            };
102            if !canonical_path.starts_with(&canonical_root) {
103                return Err(IndexerError::PathTraversal(path.to_path_buf()));
104            }
105
106            // Symlink policy:
107            //   - follow_symlinks=false (default): skip symlinks silently.
108            //   - follow_symlinks=true: symlinks that escape the root are already
109            //     rejected by the PathTraversal check above; symlinks inside the
110            //     root are allowed through.
111            if entry.path_is_symlink() && !options.follow_symlinks {
112                continue; // Skip symlinks when not following
113            }
114
115            // Secret exclusion
116            let file_name = path.file_name().unwrap_or_default().to_string_lossy();
117            if file_name == ".env"
118                || file_name.starts_with(".env.")
119                || file_name == ".envrc"
120                || file_name.ends_with(".pem")
121                || file_name.ends_with(".key")
122                || file_name.ends_with(".tfvars")
123                || file_name.ends_with(".p12")
124                || file_name.ends_with(".pfx")
125                || file_name.ends_with(".jks")
126                || file_name.starts_with("id_rsa")
127                || file_name.starts_with("id_ed25519")
128                || file_name == "id_ecdsa"
129                || file_name == "id_dsa"
130                || file_name == "id_ecdsa_sk"
131                || file_name == "id_xmss"
132                || file_name == "credentials.json"
133                || file_name.ends_with(".secrets")
134                || file_name.ends_with(".token")
135                || file_name == ".npmrc"
136                || file_name == ".netrc"
137            {
138                continue;
139            }
140
141            // Binary detection (extension)
142            let ext = path
143                .extension()
144                .unwrap_or_default()
145                .to_string_lossy()
146                .to_lowercase();
147            if [
148                "exe", "dll", "so", "png", "jpg", "jpeg", "pdf", "db", "sqlite", "wasm",
149            ]
150            .contains(&ext.as_str())
151            {
152                continue;
153            }
154
155            // Size limit
156            let metadata = std::fs::metadata(path)?;
157            if metadata.len() > options.max_file_size_bytes {
158                continue;
159            }
160
161            // Binary detection (null bytes) and PEM credential detection
162            let mut file = File::open(path)?;
163            let mut buffer = [0; 8192];
164            let bytes_read = file.read(&mut buffer)?;
165            let chunk = &buffer[..bytes_read];
166            if chunk.contains(&0) {
167                continue;
168            }
169            // Skip PEM-encoded credential files (private keys, certificates, etc.)
170            if chunk.windows(11).any(|w| w == b"-----BEGIN ") {
171                continue;
172            }
173
174            // Hash
175            let mut hasher = Sha256::new();
176            let mut file = File::open(path)?;
177            std::io::copy(&mut file, &mut hasher)?;
178            let hash = hex::encode(hasher.finalize());
179
180            let rel_path = path
181                .strip_prefix(workspace_root)
182                .unwrap_or(path)
183                .to_path_buf();
184
185            results.push(FileHash {
186                path: rel_path,
187                hash,
188            });
189
190            count += 1;
191            if count >= options.max_file_count {
192                return Err(IndexerError::FileCountLimitExceeded);
193            }
194        }
195
196        Ok(results)
197    }
198}
199
200#[cfg(test)]
201mod tests {
202    use super::*;
203    use std::fs::{self, File};
204    use std::io::Write;
205    use tempfile::TempDir;
206
207    fn setup_workspace() -> TempDir {
208        TempDir::new().unwrap()
209    }
210
211    #[test]
212    fn test_skips_symlinks_escaping_root() {
213        let dir = setup_workspace();
214        let root = dir.path();
215
216        let out_dir = TempDir::new().unwrap();
217        let out_file = out_dir.path().join("out.txt");
218        fs::write(&out_file, "out").unwrap();
219
220        let symlink_path = root.join("link");
221        #[cfg(unix)]
222        std::os::unix::fs::symlink(&out_file, &symlink_path).unwrap();
223
224        let indexer = DefaultIndexer;
225        let options = IndexerOptions {
226            follow_symlinks: true,
227            ..Default::default()
228        };
229
230        let result = indexer.index(root, &options);
231        assert!(matches!(result, Err(IndexerError::PathTraversal(_))));
232    }
233
234    #[test]
235    fn test_ignores_env_and_secret_files() {
236        let dir = setup_workspace();
237        let root = dir.path();
238        fs::write(root.join(".env"), "secret").unwrap();
239        fs::write(root.join("id_rsa"), "secret").unwrap();
240        fs::write(root.join("config.pem"), "secret").unwrap();
241        fs::write(root.join("normal.txt"), "normal").unwrap();
242
243        let indexer = DefaultIndexer;
244        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
245        assert_eq!(results.len(), 1);
246        assert_eq!(results[0].path, PathBuf::from("normal.txt"));
247    }
248
249    #[test]
250    fn test_ignores_gitignore() {
251        let dir = setup_workspace();
252        let root = dir.path();
253        fs::create_dir(root.join("ignored_dir")).unwrap();
254        fs::write(root.join("ignored_dir/test.txt"), "ignored").unwrap();
255        fs::write(root.join(".gitignore"), "ignored_dir/").unwrap();
256
257        let indexer = DefaultIndexer;
258        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
259        assert!(results.iter().all(|r| !r.path.starts_with("ignored_dir")));
260    }
261
262    #[test]
263    fn test_ignores_codebonesignore() {
264        let dir = setup_workspace();
265        let root = dir.path();
266        fs::create_dir(root.join("drafts")).unwrap();
267        fs::write(root.join("drafts/test.txt"), "ignored").unwrap();
268        fs::write(root.join(".codebonesignore"), "drafts/").unwrap();
269
270        let indexer = DefaultIndexer;
271        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
272        assert!(results.iter().all(|r| !r.path.starts_with("drafts")));
273    }
274
275    #[test]
276    fn test_skips_large_files() {
277        let dir = setup_workspace();
278        let root = dir.path();
279        let mut file = File::create(root.join("large.txt")).unwrap();
280        file.write_all(&vec![b'a'; 600 * 1024]).unwrap();
281
282        let indexer = DefaultIndexer;
283        let options = IndexerOptions {
284            max_file_size_bytes: 500 * 1024,
285            ..Default::default()
286        };
287        let results = indexer.index(root, &options).unwrap();
288        assert!(results.is_empty());
289    }
290
291    #[test]
292    fn test_skips_binary_extension() {
293        let dir = setup_workspace();
294        let root = dir.path();
295        fs::write(root.join("test.exe"), "fake binary").unwrap();
296
297        let indexer = DefaultIndexer;
298        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
299        assert!(results.is_empty());
300    }
301
302    #[test]
303    fn test_skips_binary_null_bytes() {
304        let dir = setup_workspace();
305        let root = dir.path();
306        fs::write(root.join("fake.txt"), b"hello\0world").unwrap();
307
308        let indexer = DefaultIndexer;
309        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
310        assert!(results.is_empty());
311    }
312
313    #[test]
314    fn test_replaces_invalid_utf8() {
315        let dir = setup_workspace();
316        let root = dir.path();
317        fs::write(root.join("invalid.txt"), b"hello\xFFworld").unwrap();
318
319        let indexer = DefaultIndexer;
320        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
321        assert_eq!(results.len(), 1);
322    }
323
324    #[test]
325    fn test_stops_at_file_count_limit() {
326        let dir = setup_workspace();
327        let root = dir.path();
328        for i in 0..10 {
329            fs::write(root.join(format!("{}.txt", i)), "test").unwrap();
330        }
331
332        let indexer = DefaultIndexer;
333        let options = IndexerOptions {
334            max_file_count: 5,
335            ..Default::default()
336        };
337        let result = indexer.index(root, &options);
338        assert!(matches!(result, Err(IndexerError::FileCountLimitExceeded)));
339    }
340
341    #[test]
342    fn test_generates_correct_hash() {
343        let dir = setup_workspace();
344        let root = dir.path();
345        fs::write(root.join("test.txt"), "hello world").unwrap();
346
347        let indexer = DefaultIndexer;
348        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
349        assert_eq!(results.len(), 1);
350        assert_eq!(
351            results[0].hash,
352            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
353        );
354    }
355
356    // --- Secret file exclusion ---
357
358    #[test]
359    fn test_excludes_dotenv_file() {
360        let dir = setup_workspace();
361        let root = dir.path();
362        fs::write(root.join(".env"), "SECRET=hunter2").unwrap();
363
364        let indexer = DefaultIndexer;
365        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
366        let names: Vec<_> = results
367            .iter()
368            .map(|r| r.path.to_string_lossy().to_string())
369            .collect();
370        assert!(
371            !names.iter().any(|n| n == ".env"),
372            ".env must be excluded, got: {:?}",
373            names
374        );
375    }
376
377    #[test]
378    fn test_excludes_id_rsa_file() {
379        let dir = setup_workspace();
380        let root = dir.path();
381        fs::write(root.join("id_rsa"), "-----BEGIN RSA PRIVATE KEY-----").unwrap();
382
383        let indexer = DefaultIndexer;
384        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
385        let names: Vec<_> = results
386            .iter()
387            .map(|r| r.path.to_string_lossy().to_string())
388            .collect();
389        assert!(
390            !names.iter().any(|n| n == "id_rsa"),
391            "id_rsa must be excluded, got: {:?}",
392            names
393        );
394    }
395
396    #[test]
397    fn test_excludes_credentials_json_file() {
398        let dir = setup_workspace();
399        let root = dir.path();
400        fs::write(root.join("credentials.json"), r#"{"token":"secret"}"#).unwrap();
401
402        let indexer = DefaultIndexer;
403        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
404        let names: Vec<_> = results
405            .iter()
406            .map(|r| r.path.to_string_lossy().to_string())
407            .collect();
408        assert!(
409            !names.iter().any(|n| n == "credentials.json"),
410            "credentials.json must be excluded, got: {:?}",
411            names
412        );
413    }
414
415    #[test]
416    fn test_excludes_pem_header_file() {
417        // Any file whose content begins with a PEM header must be treated as a
418        // credential and excluded, regardless of its filename or extension.
419        let dir = setup_workspace();
420        let root = dir.path();
421        // Use an unusual extension to confirm the check is content-based, not name-based.
422        fs::write(
423            root.join("server.crt"),
424            "-----BEGIN RSA PRIVATE KEY-----\nMIIEowIBAAKCAQEA...\n-----END RSA PRIVATE KEY-----\n",
425        )
426        .unwrap();
427        fs::write(root.join("normal.txt"), "just text").unwrap();
428
429        let indexer = DefaultIndexer;
430        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
431        let names: Vec<_> = results
432            .iter()
433            .map(|r| r.path.to_string_lossy().to_string())
434            .collect();
435        assert!(
436            !names.iter().any(|n| n == "server.crt"),
437            "File with PEM header must be excluded, got: {:?}",
438            names
439        );
440        assert!(
441            names.iter().any(|n| n == "normal.txt"),
442            "normal.txt must still be indexed, got: {:?}",
443            names
444        );
445    }
446
447    #[test]
448    fn test_normal_rs_file_is_not_excluded() {
449        let dir = setup_workspace();
450        let root = dir.path();
451        fs::write(
452            root.join("lib.rs"),
453            "pub fn add(a: i32, b: i32) -> i32 { a + b }",
454        )
455        .unwrap();
456
457        let indexer = DefaultIndexer;
458        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
459        let names: Vec<_> = results
460            .iter()
461            .map(|r| r.path.to_string_lossy().to_string())
462            .collect();
463        assert!(
464            names.iter().any(|n| n == "lib.rs"),
465            "lib.rs must be indexed, got: {:?}",
466            names
467        );
468    }
469
470    // --- Binary file exclusion ---
471
472    #[test]
473    fn test_excludes_exe_extension() {
474        let dir = setup_workspace();
475        let root = dir.path();
476        fs::write(root.join("app.exe"), "MZ fake windows binary").unwrap();
477
478        let indexer = DefaultIndexer;
479        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
480        let names: Vec<_> = results
481            .iter()
482            .map(|r| r.path.to_string_lossy().to_string())
483            .collect();
484        assert!(
485            !names.iter().any(|n| n.ends_with(".exe")),
486            ".exe must be excluded, got: {:?}",
487            names
488        );
489    }
490
491    #[test]
492    fn test_excludes_png_extension() {
493        let dir = setup_workspace();
494        let root = dir.path();
495        // PNG magic bytes header to make it realistic, but content doesn't matter
496        fs::write(root.join("logo.png"), b"\x89PNG\r\n\x1a\nfake image data").unwrap();
497
498        let indexer = DefaultIndexer;
499        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
500        let names: Vec<_> = results
501            .iter()
502            .map(|r| r.path.to_string_lossy().to_string())
503            .collect();
504        assert!(
505            !names.iter().any(|n| n.ends_with(".png")),
506            ".png must be excluded, got: {:?}",
507            names
508        );
509    }
510
511    #[test]
512    fn test_excludes_source_file_with_null_bytes() {
513        // A file with a .rs extension but containing null bytes should be treated
514        // as binary and skipped. This catches embedded binaries misnamed as source.
515        let dir = setup_workspace();
516        let root = dir.path();
517        let mut content = b"fn main() { println!(\"hello\"); }\n".to_vec();
518        content.push(0x00); // inject a null byte
519        content.extend_from_slice(b" // more code");
520        fs::write(root.join("tricky.rs"), &content).unwrap();
521
522        let indexer = DefaultIndexer;
523        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
524        let names: Vec<_> = results
525            .iter()
526            .map(|r| r.path.to_string_lossy().to_string())
527            .collect();
528        assert!(
529            !names.iter().any(|n| n == "tricky.rs"),
530            "Source file with null bytes must be excluded, got: {:?}",
531            names
532        );
533    }
534
535    // --- Glob filtering via ignore file ---
536
537    #[test]
538    fn test_codebonesignore_glob_excludes_toml_files() {
539        // Simulate "--ignore *.toml" by writing a .codebonesignore with a glob pattern
540        let dir = setup_workspace();
541        let root = dir.path();
542        fs::write(root.join("Cargo.toml"), "[package]\nname = \"test\"").unwrap();
543        fs::write(root.join("main.rs"), "fn main() {}").unwrap();
544        fs::write(root.join(".codebonesignore"), "*.toml\n").unwrap();
545
546        let indexer = DefaultIndexer;
547        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
548        let names: Vec<_> = results
549            .iter()
550            .map(|r| r.path.to_string_lossy().to_string())
551            .collect();
552
553        assert!(
554            !names.iter().any(|n| n.ends_with(".toml")),
555            "*.toml files must be excluded via .codebonesignore, got: {:?}",
556            names
557        );
558        assert!(
559            names.iter().any(|n| n == "main.rs"),
560            "main.rs must still be indexed, got: {:?}",
561            names
562        );
563    }
564
565    #[test]
566    fn test_gitignore_glob_excludes_matching_files() {
567        // Simulate "--ignore *.log" by writing a .gitignore with a glob pattern
568        let dir = setup_workspace();
569        let root = dir.path();
570        fs::write(root.join("app.log"), "INFO: server started").unwrap();
571        fs::write(root.join("server.rs"), "fn serve() {}").unwrap();
572        fs::write(root.join(".gitignore"), "*.log\n").unwrap();
573
574        let indexer = DefaultIndexer;
575        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
576        let names: Vec<_> = results
577            .iter()
578            .map(|r| r.path.to_string_lossy().to_string())
579            .collect();
580
581        assert!(
582            !names.iter().any(|n| n.ends_with(".log")),
583            "*.log files must be excluded via .gitignore, got: {:?}",
584            names
585        );
586    }
587
588    #[test]
589    fn test_only_rs_files_indexed_when_all_others_ignored() {
590        // Simulate "--include *.rs only" by ignoring everything else via .codebonesignore
591        let dir = setup_workspace();
592        let root = dir.path();
593        fs::write(root.join("main.rs"), "fn main() {}").unwrap();
594        fs::write(root.join("readme.md"), "# Project").unwrap();
595        fs::write(root.join("config.yaml"), "key: value").unwrap();
596        // Use .codebonesignore to exclude non-Rust files
597        fs::write(root.join(".codebonesignore"), "*.md\n*.yaml\n").unwrap();
598
599        let indexer = DefaultIndexer;
600        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
601        let names: Vec<_> = results
602            .iter()
603            .map(|r| r.path.to_string_lossy().to_string())
604            .collect();
605
606        for name in &names {
607            assert!(
608                name.ends_with(".rs"),
609                "Only .rs files should be indexed, but found: {}",
610                name
611            );
612        }
613        assert!(
614            names.iter().any(|n| n == "main.rs"),
615            "main.rs must be in results"
616        );
617    }
618
619    // --- Path traversal security test ---
620
621    #[test]
622    fn test_path_traversal_outside_root_is_rejected_or_absent() {
623        // Create a workspace root and a separate directory outside it.
624        // Attempt to index a path that canonically lives outside the root.
625        // The indexer must either return an error or produce no results
626        // referencing paths outside the workspace root.
627        let workspace = TempDir::new().unwrap();
628        let outside = TempDir::new().unwrap();
629
630        // Write a file in the workspace
631        fs::write(workspace.path().join("inside.txt"), "safe content").unwrap();
632
633        // Write a file outside the workspace
634        fs::write(outside.path().join("outside.txt"), "secret content").unwrap();
635
636        // Attempt to index using a symlink that escapes the workspace root
637        // (only possible on Unix; on Windows the symlink call is a no-op and we
638        // just verify the walker doesn't traverse outside on its own)
639        #[cfg(unix)]
640        {
641            let link_path = workspace.path().join("escape_link");
642            std::os::unix::fs::symlink(outside.path().join("outside.txt"), &link_path).unwrap();
643
644            let indexer = DefaultIndexer;
645            // With follow_symlinks=false (default) the symlink is either skipped
646            // (Ok with no escaping entry) or rejected outright (Err PathTraversal).
647            // Both are correct — the escaping file must never appear in results.
648            let result = indexer.index(workspace.path(), &IndexerOptions::default());
649            let files = match result {
650                Ok(f) => f,
651                Err(IndexerError::PathTraversal(_)) | Err(IndexerError::SymlinkEscape(_)) => {
652                    vec![] // rejected at the gate — correct behaviour
653                }
654                Err(e) => panic!("Unexpected error with follow_symlinks=false: {}", e),
655            };
656
657            let outside_root = outside.path();
658            for fh in &files {
659                let absolute = workspace.path().join(&fh.path);
660                assert!(
661                    absolute.starts_with(workspace.path()),
662                    "Traversal detected: {:?} is outside {:?}",
663                    absolute,
664                    workspace.path()
665                );
666                assert_ne!(
667                    fh.path.to_string_lossy().as_ref(),
668                    "escape_link",
669                    "Symlink pointing outside root must not be indexed"
670                );
671                let _ = outside_root;
672            }
673        }
674
675        // When follow_symlinks=true, the indexer is expected to return an error
676        // for paths that escape the workspace root.
677        #[cfg(unix)]
678        {
679            let link_path2 = workspace.path().join("escape_link2");
680            // Only create if it doesn't already exist (test may run twice in parallel)
681            if !link_path2.exists() {
682                std::os::unix::fs::symlink(outside.path().join("outside.txt"), &link_path2)
683                    .unwrap();
684            }
685            let indexer = DefaultIndexer;
686            let options = IndexerOptions {
687                follow_symlinks: true,
688                ..Default::default()
689            };
690            let result = indexer.index(workspace.path(), &options);
691            // Must either be an error (PathTraversal/SymlinkEscape) or not include
692            // files that canonically live outside the workspace.
693            match result {
694                Err(IndexerError::PathTraversal(_)) | Err(IndexerError::SymlinkEscape(_)) => {
695                    // Correct: traversal detected and rejected
696                }
697                Ok(files) => {
698                    for fh in &files {
699                        let absolute = workspace.path().join(&fh.path);
700                        assert!(
701                            absolute.starts_with(workspace.path()),
702                            "Returned file escapes workspace: {:?}",
703                            absolute
704                        );
705                    }
706                }
707                Err(other) => panic!("Unexpected error: {}", other),
708            }
709        }
710    }
711
712    // --- Incremental indexing ---
713
714    #[test]
715    fn test_large_file_at_limit_is_indexed_small_file_over_limit_is_skipped() {
716        // The indexer uses `metadata.len() > max_file_size_bytes` (strict greater-than),
717        // so a file of exactly max_file_size_bytes is INCLUDED; one of max+1 is EXCLUDED.
718        let dir = setup_workspace();
719        let root = dir.path();
720
721        let max_size: u64 = 500 * 1024; // 512_000 bytes — the default limit
722
723        // File exactly AT the limit — should be indexed (not greater-than, so passes the check)
724        let at_limit_path = root.join("at_limit.txt");
725        let mut at_limit = File::create(&at_limit_path).unwrap();
726        at_limit.write_all(&vec![b'a'; max_size as usize]).unwrap();
727
728        // File one byte OVER the limit — should be skipped
729        let over_limit_path = root.join("over_limit.txt");
730        let mut over_limit = File::create(&over_limit_path).unwrap();
731        over_limit
732            .write_all(&vec![b'b'; max_size as usize + 1])
733            .unwrap();
734
735        let indexer = DefaultIndexer;
736        let options = IndexerOptions {
737            max_file_size_bytes: max_size,
738            respect_gitignore: false,
739            ..Default::default()
740        };
741
742        let results = indexer.index(root, &options).unwrap();
743        let names: Vec<String> = results
744            .iter()
745            .map(|r| r.path.to_string_lossy().to_string())
746            .collect();
747
748        assert!(
749            names.iter().any(|n| n == "at_limit.txt"),
750            "File of exactly max_file_size_bytes should be indexed (boundary is exclusive); got: {:?}",
751            names
752        );
753
754        assert!(
755            !names.iter().any(|n| n == "over_limit.txt"),
756            "File of max_file_size_bytes + 1 should NOT be indexed; got: {:?}",
757            names
758        );
759    }
760
761    #[test]
762    fn test_incremental_indexing_only_changed_file_has_new_hash() {
763        use std::collections::HashMap;
764
765        let dir = setup_workspace();
766        let root = dir.path();
767
768        // Write two files
769        fs::write(root.join("stable.rs"), "fn stable() {}").unwrap();
770        fs::write(root.join("volatile.rs"), "fn original() {}").unwrap();
771
772        let indexer = DefaultIndexer;
773        let options = IndexerOptions {
774            respect_gitignore: false,
775            ..Default::default()
776        };
777
778        // First index pass: record all hashes
779        let first_results = indexer.index(root, &options).unwrap();
780        let first_hashes: HashMap<String, String> = first_results
781            .iter()
782            .map(|fh| (fh.path.to_string_lossy().to_string(), fh.hash.clone()))
783            .collect();
784
785        assert!(
786            first_hashes.contains_key("stable.rs"),
787            "stable.rs must be in first index"
788        );
789        assert!(
790            first_hashes.contains_key("volatile.rs"),
791            "volatile.rs must be in first index"
792        );
793
794        // Modify only volatile.rs
795        fs::write(root.join("volatile.rs"), "fn modified() {}").unwrap();
796
797        // Second index pass
798        let second_results = indexer.index(root, &options).unwrap();
799        let second_hashes: HashMap<String, String> = second_results
800            .iter()
801            .map(|fh| (fh.path.to_string_lossy().to_string(), fh.hash.clone()))
802            .collect();
803
804        // stable.rs hash must be unchanged
805        assert_eq!(
806            first_hashes["stable.rs"], second_hashes["stable.rs"],
807            "stable.rs hash must not change between index passes"
808        );
809
810        // volatile.rs hash must have changed
811        assert_ne!(
812            first_hashes["volatile.rs"], second_hashes["volatile.rs"],
813            "volatile.rs hash must change after file modification"
814        );
815    }
816
817    // --- New credential exclusion tests (Gap 3) ---
818
819    #[test]
820    fn test_excludes_id_ecdsa_file() {
821        let dir = setup_workspace();
822        let root = dir.path();
823        fs::write(root.join("id_ecdsa"), "-----BEGIN EC PRIVATE KEY-----").unwrap();
824
825        let indexer = DefaultIndexer;
826        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
827        assert!(
828            results.is_empty(),
829            "id_ecdsa must be excluded, got: {:?}",
830            results
831        );
832    }
833
834    #[test]
835    fn test_excludes_tfvars_file() {
836        let dir = setup_workspace();
837        let root = dir.path();
838        fs::write(root.join("terraform.tfvars"), "db_password = \"secret\"").unwrap();
839
840        let indexer = DefaultIndexer;
841        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
842        assert!(
843            results.is_empty(),
844            "terraform.tfvars must be excluded, got: {:?}",
845            results
846        );
847    }
848
849    #[test]
850    fn test_excludes_p12_file() {
851        let dir = setup_workspace();
852        let root = dir.path();
853        fs::write(root.join("keystore.p12"), b"fake pkcs12 binary bytes").unwrap();
854
855        let indexer = DefaultIndexer;
856        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
857        assert!(
858            results.is_empty(),
859            "keystore.p12 must be excluded, got: {:?}",
860            results
861        );
862    }
863
864    #[test]
865    fn test_indexes_crt_file_without_pem_header() {
866        let dir = setup_workspace();
867        let root = dir.path();
868        fs::write(root.join("cert.crt"), "CERTIFICATE DATA WITHOUT PEM HEADER").unwrap();
869
870        let indexer = DefaultIndexer;
871        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
872        let names: Vec<_> = results
873            .iter()
874            .map(|r| r.path.to_string_lossy().to_string())
875            .collect();
876        assert!(
877            names.iter().any(|n| n == "cert.crt"),
878            "cert.crt without a PEM header must be indexed, got: {:?}",
879            names
880        );
881    }
882
883    #[test]
884    fn test_excludes_crt_file_with_pem_header() {
885        let dir = setup_workspace();
886        let root = dir.path();
887        fs::write(
888            root.join("cert.crt"),
889            "-----BEGIN CERTIFICATE-----\nMIIB...",
890        )
891        .unwrap();
892
893        let indexer = DefaultIndexer;
894        let results = indexer.index(root, &IndexerOptions::default()).unwrap();
895        let names: Vec<_> = results
896            .iter()
897            .map(|r| r.path.to_string_lossy().to_string())
898            .collect();
899        assert!(
900            !names.iter().any(|n| n == "cert.crt"),
901            "cert.crt with a PEM header must be excluded, got: {:?}",
902            names
903        );
904    }
905
906    // --- Symlink within root behavior test (Gap 4) ---
907
908    #[test]
909    #[cfg(unix)]
910    fn test_symlink_within_root_indexed_with_follow_symlinks() {
911        // Documents the current behavior: with follow_symlinks=true, symlinks that
912        // resolve within the root are allowed through and indexed. Only symlinks that
913        // escape the root produce a PathTraversal error.
914        let dir = setup_workspace();
915        let root = dir.path();
916
917        // Create a real file inside the root.
918        let real_file = root.join("real.rs");
919        fs::write(&real_file, "fn real() {}").unwrap();
920
921        // Create a symlink inside the root that points to the real file (still within root).
922        let symlink_path = root.join("link_to_real.rs");
923        std::os::unix::fs::symlink(&real_file, &symlink_path).unwrap();
924
925        let indexer = DefaultIndexer;
926        let options = IndexerOptions {
927            follow_symlinks: true,
928            ..Default::default()
929        };
930
931        // With follow_symlinks=true, a within-root symlink must be indexed successfully.
932        let result = indexer.index(root, &options);
933        assert!(
934            result.is_ok(),
935            "with follow_symlinks=true, a symlink inside the root must be indexed (not errored); got: {:?}",
936            result
937        );
938        let files = result.unwrap();
939        let names: Vec<_> = files
940            .iter()
941            .map(|r| r.path.to_string_lossy().to_string())
942            .collect();
943        assert!(
944            names.iter().any(|n| n == "link_to_real.rs"),
945            "the within-root symlink must appear in indexed results; got: {:?}",
946            names
947        );
948    }
949}