1use std::path::{Path, PathBuf};
35use std::sync::{mpsc, Arc, Mutex};
36
37use anyhow::Result;
38use ignore::{DirEntry, ParallelVisitor, ParallelVisitorBuilder, WalkBuilder, WalkState};
39
/// Size limit, in bytes, that a new [`Walker`] applies unless overridden (1 MiB).
///
/// Files strictly larger than the active limit are not reported.
pub const DEFAULT_MAX_FILE_SIZE: u64 = 1 << 20;

/// Number of files a visitor buffers locally before sending them over the channel.
const FLUSH_THRESHOLD: usize = 32;
51
/// Programming language attributed to a file.
///
/// The value is derived purely from the file extension by [`detect_language`];
/// file contents are never inspected.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Language {
    /// Rust source.
    Rust,
    /// TypeScript source.
    TypeScript,
    /// JavaScript source.
    JavaScript,
    /// Python source or stub.
    Python,
    /// Go source.
    Go,
    /// Java source.
    Java,
    /// C source or header.
    C,
    /// C++ source or header.
    Cpp,
    /// Ruby source.
    Ruby,
    /// Scala source or script.
    Scala,
    /// Elixir source or script.
    Elixir,
    /// Haskell source (plain or literate).
    Haskell,
    /// Anything whose extension is missing or not recognised.
    Unknown,
}
75
76#[derive(Debug, Clone)]
78pub struct WalkedFile {
79 pub abs_path: PathBuf,
81 pub rel_path: String,
84 pub language: Language,
85 pub size_bytes: u64,
86 pub mtime_secs: u64,
90}
91
/// Configurable directory walker that reports source files below a root directory.
///
/// Create one with [`Walker::new`], adjust it with the builder-style setters, then call
/// [`Walker::walk`] for a sorted `Vec` or [`Walker::walk_channel`] for a streaming receiver.
#[derive(Debug, Clone)]
pub struct Walker {
    /// Directory at which the walk starts.
    root: PathBuf,
    /// Files strictly larger than this many bytes are skipped.
    max_file_size: u64,
    /// Whether symbolic links are followed during traversal.
    follow_symlinks: bool,
}
115
116impl Walker {
117 pub fn new(root: impl Into<PathBuf>) -> Self {
119 Self {
120 root: root.into(),
121 max_file_size: DEFAULT_MAX_FILE_SIZE,
122 follow_symlinks: false,
123 }
124 }
125
126 pub fn max_file_size(mut self, bytes: u64) -> Self {
128 self.max_file_size = bytes;
129 self
130 }
131
132 pub fn follow_symlinks(mut self, yes: bool) -> Self {
134 self.follow_symlinks = yes;
135 self
136 }
137
138 pub fn walk_channel(&self) -> Result<mpsc::Receiver<WalkedFile>> {
146 if !self.root.is_dir() {
147 anyhow::bail!("walk root is not a directory: {}", self.root.display());
148 }
149
150 let (tx, rx) = mpsc::channel::<WalkedFile>();
151
152 let root_arc = Arc::new(self.root.clone());
155 let max_file_size = self.max_file_size;
156 let follow_symlinks = self.follow_symlinks;
157
158 std::thread::spawn(move || {
163 let walk = WalkBuilder::new(root_arc.as_path())
164 .hidden(false)
167 .follow_links(follow_symlinks)
168 .git_ignore(true)
170 .git_global(true)
171 .git_exclude(true)
172 .build_parallel();
173
174 let mut builder = VisitorBuilder {
175 tx: Arc::new(Mutex::new(tx)),
179 root: root_arc,
180 max_file_size,
181 };
182
183 walk.visit(&mut builder);
184 });
188
189 Ok(rx)
190 }
191
192 pub fn walk(&self) -> Result<Vec<WalkedFile>> {
200 let mut files: Vec<WalkedFile> = self.walk_channel()?.into_iter().collect();
201 files.sort_unstable_by(|a, b| a.rel_path.cmp(&b.rel_path));
204 Ok(files)
205 }
206}
207
208struct VisitorBuilder {
218 tx: Arc<Mutex<mpsc::Sender<WalkedFile>>>,
219 root: Arc<PathBuf>,
220 max_file_size: u64,
221}
222
223impl<'s> ParallelVisitorBuilder<'s> for VisitorBuilder {
224 fn build(&mut self) -> Box<dyn ParallelVisitor + 's> {
225 let tx = self
228 .tx
229 .lock()
230 .expect("VisitorBuilder mutex poisoned")
231 .clone();
232 Box::new(FileVisitor {
233 local: Vec::with_capacity(FLUSH_THRESHOLD),
234 tx,
235 root: Arc::clone(&self.root),
236 max_file_size: self.max_file_size,
237 })
238 }
239}
240
241struct FileVisitor {
244 local: Vec<WalkedFile>,
247 tx: mpsc::Sender<WalkedFile>,
248 root: Arc<PathBuf>,
249 max_file_size: u64,
250}
251
252impl FileVisitor {
253 fn flush(&mut self) -> bool {
258 for file in std::mem::take(&mut self.local) {
262 if self.tx.send(file).is_err() {
263 return false;
264 }
265 }
266 true
267 }
268}
269
270impl Drop for FileVisitor {
271 fn drop(&mut self) {
272 self.flush();
274 }
275}
276
277impl ParallelVisitor for FileVisitor {
278 fn visit(&mut self, entry: Result<DirEntry, ignore::Error>) -> WalkState {
279 let entry = match entry {
280 Ok(e) => e,
281 Err(e) => {
282 tracing::warn!("walker: entry error: {e}");
283 return WalkState::Continue;
284 }
285 };
286
287 let file_type = match entry.file_type() {
290 Some(ft) => ft,
291 None => return WalkState::Continue, };
293
294 if !file_type.is_file() {
301 return WalkState::Continue;
302 }
303
304 let path = entry.path();
305
306 if path.components().any(|c| c.as_os_str() == ".git") {
311 return WalkState::Continue;
312 }
313
314 if is_binary_extension(path) {
317 return WalkState::Continue;
318 }
319
320 let meta = match entry.metadata() {
323 Ok(m) => m,
324 Err(e) => {
325 tracing::warn!("walker: cannot read metadata for {}: {e}", path.display());
326 return WalkState::Continue;
327 }
328 };
329 let size_bytes = meta.len();
330 let mtime_secs = meta
331 .modified()
332 .ok()
333 .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
334 .map(|d| d.as_secs())
335 .unwrap_or(0);
336
337 if size_bytes > self.max_file_size {
338 tracing::debug!(
339 "walker: skipping large file {} ({size_bytes} bytes)",
340 path.display()
341 );
342 return WalkState::Continue;
343 }
344
345 self.local.push(WalkedFile {
346 abs_path: path.to_path_buf(),
347 rel_path: make_rel_path(&self.root, path),
348 language: detect_language(path),
349 size_bytes,
350 mtime_secs,
351 });
352
353 if self.local.len() >= FLUSH_THRESHOLD && !self.flush() {
354 return WalkState::Quit;
355 }
356
357 WalkState::Continue
358 }
359}
360
361fn make_rel_path(root: &Path, abs: &Path) -> String {
365 match abs.strip_prefix(root) {
366 Ok(rel) => rel.to_string_lossy().replace('\\', "/"),
367 Err(_) => {
368 tracing::debug!(
370 "walker: {} is not under root {}; using absolute path",
371 abs.display(),
372 root.display()
373 );
374 abs.to_string_lossy().replace('\\', "/")
375 }
376 }
377}
378
379pub fn detect_language(path: &Path) -> Language {
385 match path.extension().and_then(|e| e.to_str()) {
386 Some("rs") => Language::Rust,
387 Some("ts" | "tsx") => Language::TypeScript,
388 Some("js" | "jsx" | "mjs" | "cjs") => Language::JavaScript,
389 Some("py" | "pyi") => Language::Python,
390 Some("go") => Language::Go,
391 Some("java") => Language::Java,
392 Some("c") => Language::C,
393 Some("h") => Language::C,
396 Some("cpp" | "cc" | "cxx" | "hpp" | "hxx" | "hh") => Language::Cpp,
397 Some("rb") => Language::Ruby,
398 Some("scala" | "sc") => Language::Scala,
399 Some("ex" | "exs") => Language::Elixir,
400 Some("hs" | "lhs") => Language::Haskell,
401 _ => Language::Unknown,
402 }
403}
404
/// Reports whether `path` has an extension that the walker never reports.
///
/// Despite the name, the list covers more than binary formats: generated text artefacts
/// such as lock files and snapshots are included too. Only the final extension is examined,
/// and it is compared ASCII case-insensitively so that names like `PHOTO.JPG` or `Setup.EXE`
/// are rejected as well. Paths without a (UTF-8) extension return `false`.
fn is_binary_extension(path: &Path) -> bool {
    const SKIPPED_EXTENSIONS: &[&str] = &[
        // Images
        "png", "jpg", "jpeg", "gif", "ico", "webp", "bmp", "tiff",
        // Object files, libraries and executables
        "o", "a", "so", "dylib", "dll", "exe", "wasm",
        // JVM artefacts
        "class", "jar",
        // Archives and compressed files
        "zip", "tar", "gz", "bz2", "xz", "7z",
        // Audio and video
        "mp3", "mp4", "wav", "avi", "mkv", "mov",
        // Fonts
        "ttf", "woff", "woff2", "otf", "eot",
        // Generated lock files and snapshots
        "lock", "snap",
        // Databases
        "db", "sqlite", "sqlite3",
        // Documents
        "pdf",
    ];

    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => SKIPPED_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
436
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes `content` to `rel` below `dir`, creating missing parent directories first.
    fn write(dir: &Path, rel: &str, content: &str) {
        let target = dir.join(rel);
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(target, content).unwrap();
    }

    /// Collects the relative paths of `files` in ascending order.
    fn rel_paths(files: &[WalkedFile]) -> Vec<&str> {
        let mut sorted: Vec<&str> = files.iter().map(|file| file.rel_path.as_str()).collect();
        sorted.sort_unstable();
        sorted
    }

    // ---- Walker behaviour -------------------------------------------------------------

    #[test]
    fn walk_returns_all_source_files() {
        let tmp = TempDir::new().unwrap();
        let fixtures = [
            ("src/main.rs", "fn main() {}"),
            ("src/lib.py", "def foo(): pass"),
            ("app/index.ts", "export {}"),
        ];
        for (rel, body) in fixtures {
            write(tmp.path(), rel, body);
        }

        let files = Walker::new(tmp.path()).walk().unwrap();
        let found = rel_paths(&files);

        assert!(found.contains(&"app/index.ts"));
        assert!(found.contains(&"src/lib.py"));
        assert!(found.contains(&"src/main.rs"));
        assert_eq!(files.len(), 3);
    }

    #[test]
    fn walk_output_is_sorted_by_rel_path() {
        let tmp = TempDir::new().unwrap();
        // Created out of order on purpose.
        for name in ["z.rs", "a.rs", "m.rs"] {
            write(tmp.path(), name, "");
        }

        let files = Walker::new(tmp.path()).walk().unwrap();
        let in_walk_order: Vec<&str> = files.iter().map(|file| file.rel_path.as_str()).collect();

        assert_eq!(in_walk_order, vec!["a.rs", "m.rs", "z.rs"]);
    }

    #[test]
    fn walk_empty_dir_returns_empty_vec() {
        let tmp = TempDir::new().unwrap();
        let files = Walker::new(tmp.path()).walk().unwrap();
        assert!(files.is_empty());
    }

    #[test]
    fn walk_nested_dirs_have_correct_rel_path() {
        let tmp = TempDir::new().unwrap();
        write(tmp.path(), "a/b/c/deep.rs", "");

        let files = Walker::new(tmp.path()).walk().unwrap();

        assert_eq!(files.len(), 1);
        assert_eq!(files[0].rel_path, "a/b/c/deep.rs");
    }

    #[test]
    fn walk_rel_path_does_not_start_with_slash() {
        let tmp = TempDir::new().unwrap();
        write(tmp.path(), "src/foo.rs", "");

        let files = Walker::new(tmp.path()).walk().unwrap();

        assert_eq!(files.len(), 1);
        assert!(!files[0].rel_path.starts_with('/'));
    }

    #[test]
    fn walk_respects_gitignore() {
        let tmp = TempDir::new().unwrap();
        // A `.git` directory makes the temp dir look like a repository so that the
        // `.gitignore` rules below are applied.
        fs::create_dir(tmp.path().join(".git")).unwrap();
        write(tmp.path(), ".gitignore", "ignored.rs\ntarget/\n");
        write(tmp.path(), "kept.rs", "");
        write(tmp.path(), "ignored.rs", "");
        write(tmp.path(), "target/debug/binary", "");

        let files = Walker::new(tmp.path()).walk().unwrap();
        let found = rel_paths(&files);

        assert!(found.contains(&"kept.rs"));
        assert!(
            !found.contains(&"ignored.rs"),
            "ignored.rs should be excluded by .gitignore"
        );
        assert!(
            found.iter().all(|p| !p.starts_with("target/")),
            "target/ should be excluded by .gitignore"
        );
    }

    #[test]
    fn walk_excludes_files_over_size_limit() {
        let tmp = TempDir::new().unwrap();
        // One byte over the 512-byte limit configured below.
        fs::write(tmp.path().join("big.rs"), vec![b'x'; 513]).unwrap();
        write(tmp.path(), "small.rs", "fn main() {}");

        let files = Walker::new(tmp.path()).max_file_size(512).walk().unwrap();
        let found = rel_paths(&files);

        assert!(found.contains(&"small.rs"));
        assert!(
            !found.contains(&"big.rs"),
            "big.rs should be excluded by size limit"
        );
    }

    #[test]
    fn walk_includes_file_exactly_at_size_limit() {
        let tmp = TempDir::new().unwrap();
        fs::write(tmp.path().join("exact.rs"), vec![b'x'; 512]).unwrap();

        let files = Walker::new(tmp.path()).max_file_size(512).walk().unwrap();

        assert_eq!(
            files.len(),
            1,
            "file at exact size limit should be included"
        );
    }

    #[test]
    fn walk_excludes_binary_extensions() {
        let tmp = TempDir::new().unwrap();
        let fixtures = [
            ("image.png", "not really a png"),
            ("archive.zip", "not really a zip"),
            ("lib.so", ""),
            ("Cargo.lock", "generated"),
            ("source.rs", "fn main() {}"),
        ];
        for (rel, body) in fixtures {
            write(tmp.path(), rel, body);
        }

        let files = Walker::new(tmp.path()).walk().unwrap();
        let found = rel_paths(&files);

        assert!(found.contains(&"source.rs"));
        assert!(!found.contains(&"image.png"));
        assert!(!found.contains(&"archive.zip"));
        assert!(!found.contains(&"lib.so"));
        assert!(!found.contains(&"Cargo.lock"));
    }

    #[test]
    fn walk_does_not_yield_directories() {
        let tmp = TempDir::new().unwrap();
        fs::create_dir(tmp.path().join("subdir")).unwrap();
        write(tmp.path(), "subdir/file.rs", "");

        let files = Walker::new(tmp.path()).walk().unwrap();

        for file in &files {
            assert!(
                file.abs_path.is_file(),
                "walker yielded a directory: {}",
                file.rel_path
            );
        }
    }

    #[test]
    fn walk_channel_and_walk_return_same_files() {
        let tmp = TempDir::new().unwrap();
        for name in ["a.rs", "b.py", "c.ts"] {
            write(tmp.path(), name, "");
        }

        let walker = Walker::new(tmp.path());

        // The channel delivers in arbitrary order, so sort before comparing.
        let receiver = walker.walk_channel().unwrap();
        let mut streamed: Vec<String> = receiver.into_iter().map(|file| file.rel_path).collect();
        streamed.sort_unstable();

        // `walk` already returns its result sorted by relative path.
        let collected = walker.walk().unwrap();
        let batched: Vec<String> = collected.into_iter().map(|file| file.rel_path).collect();

        assert_eq!(streamed, batched);
    }

    #[test]
    fn walk_errors_on_nonexistent_root() {
        let outcome = Walker::new("/nonexistent/path/that/does/not/exist").walk();
        assert!(outcome.is_err());
    }

    #[test]
    fn walk_size_bytes_is_accurate() {
        let tmp = TempDir::new().unwrap();
        let content = "fn main() { println!(\"hello\"); }";
        write(tmp.path(), "main.rs", content);

        let files = Walker::new(tmp.path()).walk().unwrap();

        assert_eq!(files.len(), 1);
        assert_eq!(files[0].size_bytes, content.len() as u64);
    }

    // ---- detect_language --------------------------------------------------------------

    #[test]
    fn detect_language_rust() {
        assert_eq!(detect_language(Path::new("foo.rs")), Language::Rust);
    }

    #[test]
    fn detect_language_typescript() {
        for name in ["app.ts", "comp.tsx"] {
            assert_eq!(detect_language(Path::new(name)), Language::TypeScript);
        }
    }

    #[test]
    fn detect_language_javascript() {
        for name in ["index.js", "mod.mjs", "cjs.cjs"] {
            assert_eq!(detect_language(Path::new(name)), Language::JavaScript);
        }
    }

    #[test]
    fn detect_language_python() {
        for name in ["main.py", "types.pyi"] {
            assert_eq!(detect_language(Path::new(name)), Language::Python);
        }
    }

    #[test]
    fn detect_language_go() {
        assert_eq!(detect_language(Path::new("main.go")), Language::Go);
    }

    #[test]
    fn detect_language_java() {
        assert_eq!(detect_language(Path::new("Main.java")), Language::Java);
    }

    #[test]
    fn detect_language_c() {
        for name in ["main.c", "header.h"] {
            assert_eq!(detect_language(Path::new(name)), Language::C);
        }
    }

    #[test]
    fn detect_language_cpp() {
        for name in ["main.cpp", "util.cc", "lib.cxx", "header.hpp", "tmpl.hxx", "types.hh"] {
            assert_eq!(detect_language(Path::new(name)), Language::Cpp);
        }
    }

    #[test]
    fn detect_language_ruby() {
        assert_eq!(detect_language(Path::new("app.rb")), Language::Ruby);
    }

    #[test]
    fn detect_language_scala() {
        for name in ["Main.scala", "script.sc"] {
            assert_eq!(detect_language(Path::new(name)), Language::Scala);
        }
    }

    #[test]
    fn detect_language_elixir() {
        for name in ["app.ex", "test.exs"] {
            assert_eq!(detect_language(Path::new(name)), Language::Elixir);
        }
    }

    #[test]
    fn detect_language_haskell() {
        for name in ["Main.hs", "Literate.lhs"] {
            assert_eq!(detect_language(Path::new(name)), Language::Haskell);
        }
    }

    #[test]
    fn detect_language_unknown_for_config_and_text() {
        let unrecognised = ["Cargo.toml", "README.md", "script.sh", ".env", "no_extension"];
        for name in unrecognised {
            assert_eq!(detect_language(Path::new(name)), Language::Unknown);
        }
    }

    // ---- is_binary_extension ----------------------------------------------------------

    #[test]
    fn binary_extensions_are_excluded() {
        let rejected = [
            "image.png",
            "photo.jpg",
            "archive.zip",
            "lib.so",
            "binary.exe",
            "module.wasm",
            "Cargo.lock",
            "yarn.lock",
            "snapshot.snap",
            "data.db",
            "doc.pdf",
        ];
        for name in rejected {
            assert!(
                is_binary_extension(Path::new(name)),
                "{name} should be detected as binary"
            );
        }
    }

    #[test]
    fn source_extensions_are_not_binary() {
        let accepted = [
            "main.rs",
            "app.py",
            "index.ts",
            "main.go",
            "package.json",
            "Cargo.toml",
            "README.md",
            "style.css",
            "image.svg",
        ];
        for name in accepted {
            assert!(
                !is_binary_extension(Path::new(name)),
                "{name} should not be detected as binary"
            );
        }
    }
}