Skip to main content

mati_core/analysis/
walker.rs

1//! Parallel file walker for Layer 0 static analysis.
2//!
3//! Uses `ignore::WalkParallel` (same engine as ripgrep) for parallel,
4//! gitignore-aware directory traversal. Results stream to the caller via an
5//! `mpsc` channel so downstream parsing can start before the walk completes.
6//!
7//! # Architecture
8//!
9//! ```text
10//! Walker::walk_channel()
11//!     │
12//!     ├── spawns std::thread (WalkParallel::visit is blocking/sync)
13//!     │       │
14//!     │       ├── VisitorBuilder::build() — one FileVisitor per worker thread
15//!     │       │
16//!     │       └── FileVisitor::visit() — per-entry filtering + local buffering
17//!     │               │
18//!     │               └── flush every FLUSH_THRESHOLD entries → mpsc::Sender
19//!     │                   Drop flush handles tail entries
20//!     │
21//!     └── returns mpsc::Receiver<WalkedFile>  (parser consumes while walk runs)
22//!
23//! Walker::walk() — thin wrapper: collect channel → sort → Vec<WalkedFile>
24//! ```
25//!
26//! ## Why thread-local buffering?
27//!
28//! `mpsc::Sender::send()` acquires an internal lock on every call. With 8
29//! threads and 80k files, 80k individual sends ≈ 16ms of contention overhead.
30//! Flushing every `FLUSH_THRESHOLD` entries reduces sends to ~2 500,
31//! cutting that overhead to ~500µs while still giving the receiver batches
32//! early enough for meaningful parse pipelining.
33
34use std::path::{Path, PathBuf};
35use std::sync::{mpsc, Arc, Mutex};
36
37use anyhow::Result;
38use ignore::{DirEntry, ParallelVisitor, ParallelVisitorBuilder, WalkBuilder, WalkState};
39
40// ── Constants ─────────────────────────────────────────────────────────────────
41
/// Upper bound, in bytes, on the size of a file the walker reports when no
/// override is supplied.
///
/// Larger files are skipped without raising an error: in practice they are
/// generated artefacts (minified JS, compiled output) that are not worth
/// parsing.
pub const DEFAULT_MAX_FILE_SIZE: u64 = 1 << 20; // 1 MiB

/// How many [`WalkedFile`] entries a [`FileVisitor`] buffers locally before
/// pushing them onto the shared channel. A trade-off between streaming
/// latency and `mpsc` contention on large repos.
const FLUSH_THRESHOLD: usize = 32;
51
52// ── Public types ──────────────────────────────────────────────────────────────
53
/// Source language of a walked file, inferred purely from its extension.
///
/// Each language that has a tree-sitter grammar gets its own variant. Any
/// other file (configuration, markdown, shell scripts, ...) is reported as
/// [`Language::Unknown`]: it is still walked, just not parsed by tree-sitter.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    Rust,
    TypeScript,
    JavaScript,
    Python,
    Go,
    Java,
    C,
    Cpp,
    Ruby,
    Scala,
    Elixir,
    Haskell,
    /// No dedicated variant exists for the file's extension.
    Unknown,
}
75
76/// A single file discovered by the walker.
77#[derive(Debug, Clone)]
78pub struct WalkedFile {
79    /// Absolute path — used for opening the file for parsing.
80    pub abs_path: PathBuf,
81    /// Repo-relative path with forward slashes — used as the mati store key
82    /// suffix: `file:<rel_path>`.
83    pub rel_path: String,
84    pub language: Language,
85    pub size_bytes: u64,
86    /// File modification time as seconds since Unix epoch (0 if unavailable).
87    /// Used as a cheap pre-filter in incremental init: if mtime matches the
88    /// stored value, skip disk read + parse entirely.
89    pub mtime_secs: u64,
90}
91
92// ── Walker ────────────────────────────────────────────────────────────────────
93
/// File walker that traverses a directory tree in parallel while honouring
/// gitignore rules.
///
/// # Example
/// ```no_run
/// use mati_core::analysis::Walker;
///
/// let walker = Walker::new("/path/to/repo");
///
/// // Streaming — parser can consume while walk is still in progress.
/// for file in walker.walk_channel().unwrap() {
///     println!("{} ({:?})", file.rel_path, file.language);
/// }
///
/// // Batch — sorted Vec, useful when you need the full set before proceeding.
/// let files = walker.walk().unwrap();
/// ```
pub struct Walker {
    /// Directory the walk starts from.
    root: PathBuf,
    /// Files larger than this many bytes are not reported.
    max_file_size: u64,
    /// Whether symbolic links are followed during traversal.
    follow_symlinks: bool,
}
115
116impl Walker {
117    /// Create a walker rooted at `root` with default settings.
118    pub fn new(root: impl Into<PathBuf>) -> Self {
119        Self {
120            root: root.into(),
121            max_file_size: DEFAULT_MAX_FILE_SIZE,
122            follow_symlinks: false,
123        }
124    }
125
126    /// Override the maximum file size. Files larger than `bytes` are skipped.
127    pub fn max_file_size(mut self, bytes: u64) -> Self {
128        self.max_file_size = bytes;
129        self
130    }
131
132    /// Whether to follow symbolic links. Default: `false` (avoids cycles).
133    pub fn follow_symlinks(mut self, yes: bool) -> Self {
134        self.follow_symlinks = yes;
135        self
136    }
137
138    /// Primary interface: start the walk and return a channel receiver.
139    ///
140    /// The walk runs on a background thread; the caller can begin consuming
141    /// [`WalkedFile`] items immediately while traversal is still in progress.
142    /// The channel closes automatically when the walk finishes.
143    ///
144    /// Returns `Err` if `root` is not an accessible directory.
145    pub fn walk_channel(&self) -> Result<mpsc::Receiver<WalkedFile>> {
146        if !self.root.is_dir() {
147            anyhow::bail!("walk root is not a directory: {}", self.root.display());
148        }
149
150        let (tx, rx) = mpsc::channel::<WalkedFile>();
151
152        // One clone, used for both WalkBuilder::new and as the root reference
153        // passed to every FileVisitor via VisitorBuilder.
154        let root_arc = Arc::new(self.root.clone());
155        let max_file_size = self.max_file_size;
156        let follow_symlinks = self.follow_symlinks;
157
158        // Spawn a dedicated thread: WalkParallel::visit is blocking and spawns
159        // its own worker threads internally. We must not block the async
160        // runtime (tokio) — always call walk_channel from a spawn_blocking
161        // context when used from async code.
162        std::thread::spawn(move || {
163            let walk = WalkBuilder::new(root_arc.as_path())
164                // Include hidden files — .gitignore is the authority on what
165                // to skip; hiding .github/, .claude/ etc. would lose coverage.
166                .hidden(false)
167                .follow_links(follow_symlinks)
168                // All git-related ignore rules enabled (default, stated for clarity).
169                .git_ignore(true)
170                .git_global(true)
171                .git_exclude(true)
172                .build_parallel();
173
174            let mut builder = VisitorBuilder {
175                // Arc<Mutex<Sender>> satisfies the Send + Sync bound required
176                // by ParallelVisitorBuilder. Each FileVisitor clones the
177                // Sender out of the Mutex exactly once in build().
178                tx: Arc::new(Mutex::new(tx)),
179                root: root_arc,
180                max_file_size,
181            };
182
183            walk.visit(&mut builder);
184            // builder drops here → Arc<Mutex<Sender>> drops → all Sender
185            // clones held by FileVisitors have already been dropped when their
186            // threads finished → channel closes → receiver exhausts cleanly.
187        });
188
189        Ok(rx)
190    }
191
192    /// Batch interface: collect the full walk into a sorted `Vec`.
193    ///
194    /// Useful for callers that need the complete file list before proceeding
195    /// (tests, dep parsing, one-shot reporting). Prefer [`walk_channel`] when
196    /// results will be piped into a parallel processing stage.
197    ///
198    /// [`walk_channel`]: Walker::walk_channel
199    pub fn walk(&self) -> Result<Vec<WalkedFile>> {
200        let mut files: Vec<WalkedFile> = self.walk_channel()?.into_iter().collect();
201        // Deterministic order: sort by repo-relative path so downstream
202        // consumers (store writes, tests) produce repeatable output.
203        files.sort_unstable_by(|a, b| a.rel_path.cmp(&b.rel_path));
204        Ok(files)
205    }
206}
207
208// ── Internal visitor types ────────────────────────────────────────────────────
209
210/// Builds a [`FileVisitor`] for each worker thread spawned by `WalkParallel`.
211///
212/// Must implement `Send + Sync`:
213/// - `Arc<Mutex<mpsc::Sender<_>>>`: Send (Arc<T>: Send when T: Send+Sync) +
214///   Sync (Mutex<T>: Sync when T: Send, mpsc::Sender<T>: Send) ✓
215/// - `Arc<PathBuf>`: Send + Sync ✓
216/// - `u64`: Send + Sync ✓
217struct VisitorBuilder {
218    tx: Arc<Mutex<mpsc::Sender<WalkedFile>>>,
219    root: Arc<PathBuf>,
220    max_file_size: u64,
221}
222
223impl<'s> ParallelVisitorBuilder<'s> for VisitorBuilder {
224    fn build(&mut self) -> Box<dyn ParallelVisitor + 's> {
225        // Clone the Sender once per thread. The Mutex is held only for the
226        // duration of clone() — essentially free.
227        let tx = self
228            .tx
229            .lock()
230            .expect("VisitorBuilder mutex poisoned")
231            .clone();
232        Box::new(FileVisitor {
233            local: Vec::with_capacity(FLUSH_THRESHOLD),
234            tx,
235            root: Arc::clone(&self.root),
236            max_file_size: self.max_file_size,
237        })
238    }
239}
240
241/// Per-thread visitor. Accumulates entries locally and flushes in batches to
242/// reduce `mpsc` lock contention on high-file-count repos.
243struct FileVisitor {
244    /// Thread-local accumulator — flushed every FLUSH_THRESHOLD entries and
245    /// on Drop (tail flush for the final partial batch).
246    local: Vec<WalkedFile>,
247    tx: mpsc::Sender<WalkedFile>,
248    root: Arc<PathBuf>,
249    max_file_size: u64,
250}
251
252impl FileVisitor {
253    /// Send all buffered entries to the channel.
254    ///
255    /// Returns `false` if the receiver was dropped — the caller should return
256    /// [`WalkState::Quit`] to stop the walk early.
257    fn flush(&mut self) -> bool {
258        // mem::take swaps self.local with an empty Vec, giving us owned
259        // iteration without holding a borrow on self.local. Any remaining
260        // items are dropped when `batch` goes out of scope.
261        for file in std::mem::take(&mut self.local) {
262            if self.tx.send(file).is_err() {
263                return false;
264            }
265        }
266        true
267    }
268}
269
270impl Drop for FileVisitor {
271    fn drop(&mut self) {
272        // Tail flush: send any entries accumulated since the last threshold flush.
273        self.flush();
274    }
275}
276
277impl ParallelVisitor for FileVisitor {
278    fn visit(&mut self, entry: Result<DirEntry, ignore::Error>) -> WalkState {
279        let entry = match entry {
280            Ok(e) => e,
281            Err(e) => {
282                tracing::warn!("walker: entry error: {e}");
283                return WalkState::Continue;
284            }
285        };
286
287        // DirEntry::file_type() is free on Linux (returned by readdir).
288        // On macOS it may require a stat; ignore handles the caching.
289        let file_type = match entry.file_type() {
290            Some(ft) => ft,
291            None => return WalkState::Continue, // stdin / unknown — skip
292        };
293
294        // Only process regular files. This explicitly skips:
295        //   • directories (walk nodes — the ignore crate descends them)
296        //   • symlinks — even with follow_links(false), ignore still yields
297        //     symlink entries; opening a symlink path follows it implicitly,
298        //     which violates the follow_symlinks=false contract
299        //   • device files, pipes, sockets
300        if !file_type.is_file() {
301            return WalkState::Continue;
302        }
303
304        let path = entry.path();
305
306        // Skip .git/ internals. .hidden(false) is needed to include .github/,
307        // .claude/, etc., but the .git directory itself contains no project
308        // knowledge — only git object/ref data. Check via path components so
309        // we don't accidentally skip a legitimate ".git"-named user directory.
310        if path.components().any(|c| c.as_os_str() == ".git") {
311            return WalkState::Continue;
312        }
313
314        // Extension-based binary filter: checked before metadata() to avoid
315        // unnecessary syscalls on clearly unanalysable files.
316        if is_binary_extension(path) {
317            return WalkState::Continue;
318        }
319
320        // DirEntry::metadata() reuses cached data where available (inode
321        // info from readdir on Linux). Unavoidable for size filtering.
322        let meta = match entry.metadata() {
323            Ok(m) => m,
324            Err(e) => {
325                tracing::warn!("walker: cannot read metadata for {}: {e}", path.display());
326                return WalkState::Continue;
327            }
328        };
329        let size_bytes = meta.len();
330        let mtime_secs = meta
331            .modified()
332            .ok()
333            .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
334            .map(|d| d.as_secs())
335            .unwrap_or(0);
336
337        if size_bytes > self.max_file_size {
338            tracing::debug!(
339                "walker: skipping large file {} ({size_bytes} bytes)",
340                path.display()
341            );
342            return WalkState::Continue;
343        }
344
345        self.local.push(WalkedFile {
346            abs_path: path.to_path_buf(),
347            rel_path: make_rel_path(&self.root, path),
348            language: detect_language(path),
349            size_bytes,
350            mtime_secs,
351        });
352
353        if self.local.len() >= FLUSH_THRESHOLD && !self.flush() {
354            return WalkState::Quit;
355        }
356
357        WalkState::Continue
358    }
359}
360
361// ── Helper functions ──────────────────────────────────────────────────────────
362
363/// Compute a forward-slash repo-relative path for use as the mati store key.
364fn make_rel_path(root: &Path, abs: &Path) -> String {
365    match abs.strip_prefix(root) {
366        Ok(rel) => rel.to_string_lossy().replace('\\', "/"),
367        Err(_) => {
368            // Should never happen — all entries come from walking root.
369            tracing::debug!(
370                "walker: {} is not under root {}; using absolute path",
371                abs.display(),
372                root.display()
373            );
374            abs.to_string_lossy().replace('\\', "/")
375        }
376    }
377}
378
379/// Detect programming language from file extension.
380///
381/// Only languages with a tree-sitter grammar in this project are explicitly
382/// matched. Config files, markdown, shell scripts, etc. return
383/// [`Language::Unknown`] — they are still walked and stored but not parsed.
384pub fn detect_language(path: &Path) -> Language {
385    match path.extension().and_then(|e| e.to_str()) {
386        Some("rs") => Language::Rust,
387        Some("ts" | "tsx") => Language::TypeScript,
388        Some("js" | "jsx" | "mjs" | "cjs") => Language::JavaScript,
389        Some("py" | "pyi") => Language::Python,
390        Some("go") => Language::Go,
391        Some("java") => Language::Java,
392        Some("c") => Language::C,
393        // .h is ambiguous (C vs C++ vs ObjC). Defaults to C — C++ headers
394        // typically use .hpp/.hxx/.hh. This is a known, accepted heuristic.
395        Some("h") => Language::C,
396        Some("cpp" | "cc" | "cxx" | "hpp" | "hxx" | "hh") => Language::Cpp,
397        Some("rb") => Language::Ruby,
398        Some("scala" | "sc") => Language::Scala,
399        Some("ex" | "exs") => Language::Elixir,
400        Some("hs" | "lhs") => Language::Haskell,
401        _ => Language::Unknown,
402    }
403}
404
/// Return `true` for extensions that indicate binary or generated files that
/// are never useful for tree-sitter analysis or mati knowledge records.
///
/// The comparison ignores ASCII case, so `PHOTO.JPG` and `Setup.EXE` are
/// recognised just like their lower-case spellings. Paths with no extension
/// (including dotfiles such as `.lock`) or with a non-UTF-8 extension return
/// `false`.
///
/// `.svg` is intentionally excluded from this list — it is XML text and may
/// appear in documentation or assets that are relevant to know about.
/// `.json` is also excluded — `package.json`, `tsconfig.json` etc. are
/// valuable for dependency analysis (M-06-E).
fn is_binary_extension(path: &Path) -> bool {
    /// Lower-case extensions treated as binary/generated.
    const BINARY_EXTENSIONS: &[&str] = &[
        // Raster images
        "png", "jpg", "jpeg", "gif", "ico", "webp", "bmp", "tiff",
        // Compiled / native artefacts
        "o", "a", "so", "dylib", "dll", "exe", "wasm", "class", "jar",
        // Archives
        "zip", "tar", "gz", "bz2", "xz", "7z",
        // Media
        "mp3", "mp4", "wav", "avi", "mkv", "mov",
        // Fonts
        "ttf", "woff", "woff2", "otf", "eot",
        // Generated lock / snapshot files — large, not useful for analysis
        "lock", "snap",
        // Databases
        "db", "sqlite", "sqlite3",
        // Documents
        "pdf",
    ];

    match path.extension().and_then(|e| e.to_str()) {
        // eq_ignore_ascii_case compares in place, so no lower-cased copy of
        // the extension has to be allocated per file.
        Some(ext) => BINARY_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
436
437// ── Tests ─────────────────────────────────────────────────────────────────────
438
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    // ── Helpers ───────────────────────────────────────────────────────────────

    /// Create the file `rel` beneath `dir` (making parent directories as
    /// needed) and fill it with `content`.
    fn write(dir: &Path, rel: &str, content: &str) {
        let target = dir.join(rel);
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(target, content).unwrap();
    }

    /// Borrow every `rel_path` from a walk result and return them sorted.
    fn rel_paths(files: &[WalkedFile]) -> Vec<&str> {
        let mut out = Vec::with_capacity(files.len());
        for file in files {
            out.push(file.rel_path.as_str());
        }
        out.sort_unstable();
        out
    }

    // ── Walker behaviour ──────────────────────────────────────────────────────

    #[test]
    fn walk_returns_all_source_files() {
        let dir = TempDir::new().unwrap();
        let fixtures = [
            ("src/main.rs", "fn main() {}"),
            ("src/lib.py", "def foo(): pass"),
            ("app/index.ts", "export {}"),
        ];
        for (rel, body) in fixtures {
            write(dir.path(), rel, body);
        }

        let files = Walker::new(dir.path()).walk().unwrap();

        // Exactly these three files, nothing else.
        assert_eq!(files.len(), 3);
        assert_eq!(
            rel_paths(&files),
            vec!["app/index.ts", "src/lib.py", "src/main.rs"]
        );
    }

    #[test]
    fn walk_output_is_sorted_by_rel_path() {
        let dir = TempDir::new().unwrap();
        // Deliberately created out of order.
        for name in ["z.rs", "a.rs", "m.rs"] {
            write(dir.path(), name, "");
        }

        let files = Walker::new(dir.path()).walk().unwrap();
        // No extra sorting here: the order produced by walk() is under test.
        let mut paths: Vec<&str> = Vec::new();
        for file in &files {
            paths.push(file.rel_path.as_str());
        }

        assert_eq!(paths, vec!["a.rs", "m.rs", "z.rs"]);
    }

    #[test]
    fn walk_empty_dir_returns_empty_vec() {
        let dir = TempDir::new().unwrap();
        let walker = Walker::new(dir.path());
        assert!(walker.walk().unwrap().is_empty());
    }

    #[test]
    fn walk_nested_dirs_have_correct_rel_path() {
        let dir = TempDir::new().unwrap();
        write(dir.path(), "a/b/c/deep.rs", "");

        let files = Walker::new(dir.path()).walk().unwrap();

        assert_eq!(files.len(), 1);
        let only = &files[0];
        assert_eq!(only.rel_path, "a/b/c/deep.rs");
    }

    #[test]
    fn walk_rel_path_does_not_start_with_slash() {
        let dir = TempDir::new().unwrap();
        write(dir.path(), "src/foo.rs", "");

        let files = Walker::new(dir.path()).walk().unwrap();

        assert_eq!(files.len(), 1);
        let only = &files[0];
        assert!(!only.rel_path.starts_with('/'));
    }

    #[test]
    fn walk_respects_gitignore() {
        let dir = TempDir::new().unwrap();
        let root = dir.path();
        // The ignore crate only applies .gitignore once it detects a git
        // root; an empty .git directory is enough for that.
        fs::create_dir(root.join(".git")).unwrap();
        write(root, ".gitignore", "ignored.rs\ntarget/\n");
        write(root, "kept.rs", "");
        write(root, "ignored.rs", "");
        write(root, "target/debug/binary", "");

        let files = Walker::new(root).walk().unwrap();
        let paths = rel_paths(&files);

        assert!(paths.contains(&"kept.rs"));
        assert!(
            !paths.contains(&"ignored.rs"),
            "ignored.rs should be excluded by .gitignore"
        );
        assert!(
            !paths.iter().any(|p| p.starts_with("target/")),
            "target/ should be excluded by .gitignore"
        );
    }

    #[test]
    fn walk_excludes_files_over_size_limit() {
        let dir = TempDir::new().unwrap();
        // One byte over the 512-byte limit configured below.
        fs::write(dir.path().join("big.rs"), vec![b'x'; 513]).unwrap();
        write(dir.path(), "small.rs", "fn main() {}");

        let walker = Walker::new(dir.path()).max_file_size(512);
        let files = walker.walk().unwrap();
        let paths = rel_paths(&files);

        assert!(paths.contains(&"small.rs"));
        assert!(
            !paths.contains(&"big.rs"),
            "big.rs should be excluded by size limit"
        );
    }

    #[test]
    fn walk_includes_file_exactly_at_size_limit() {
        let dir = TempDir::new().unwrap();
        fs::write(dir.path().join("exact.rs"), vec![b'x'; 512]).unwrap();

        let walker = Walker::new(dir.path()).max_file_size(512);
        let files = walker.walk().unwrap();

        assert_eq!(
            files.len(),
            1,
            "file at exact size limit should be included"
        );
    }

    #[test]
    fn walk_excludes_binary_extensions() {
        let dir = TempDir::new().unwrap();
        let fixtures = [
            ("image.png", "not really a png"),
            ("archive.zip", "not really a zip"),
            ("lib.so", ""),
            ("Cargo.lock", "generated"),
            ("source.rs", "fn main() {}"),
        ];
        for (rel, body) in fixtures {
            write(dir.path(), rel, body);
        }

        let files = Walker::new(dir.path()).walk().unwrap();
        let paths = rel_paths(&files);

        assert!(paths.contains(&"source.rs"));
        assert!(!paths.contains(&"image.png"));
        assert!(!paths.contains(&"archive.zip"));
        assert!(!paths.contains(&"lib.so"));
        assert!(!paths.contains(&"Cargo.lock"));
    }

    #[test]
    fn walk_does_not_yield_directories() {
        let dir = TempDir::new().unwrap();
        fs::create_dir(dir.path().join("subdir")).unwrap();
        write(dir.path(), "subdir/file.rs", "");

        let files = Walker::new(dir.path()).walk().unwrap();

        files.iter().for_each(|f| {
            assert!(
                f.abs_path.is_file(),
                "walker yielded a directory: {}",
                f.rel_path
            );
        });
    }

    #[test]
    fn walk_channel_and_walk_return_same_files() {
        let dir = TempDir::new().unwrap();
        for name in ["a.rs", "b.py", "c.ts"] {
            write(dir.path(), name, "");
        }

        let walker = Walker::new(dir.path());

        // Streaming output arrives in arbitrary order, so sort it here.
        let mut channel_paths: Vec<String> = Vec::new();
        for file in walker.walk_channel().unwrap() {
            channel_paths.push(file.rel_path);
        }
        channel_paths.sort_unstable();

        // Batch output is already sorted by walk().
        let batch = walker.walk().unwrap();
        let batch_paths: Vec<String> = batch.into_iter().map(|f| f.rel_path).collect();

        assert_eq!(channel_paths, batch_paths);
    }

    #[test]
    fn walk_errors_on_nonexistent_root() {
        let walker = Walker::new("/nonexistent/path/that/does/not/exist");
        assert!(walker.walk().is_err());
    }

    #[test]
    fn walk_size_bytes_is_accurate() {
        let dir = TempDir::new().unwrap();
        let content = "fn main() { println!(\"hello\"); }";
        write(dir.path(), "main.rs", content);

        let files = Walker::new(dir.path()).walk().unwrap();

        assert_eq!(files.len(), 1);
        let expected = content.len() as u64;
        assert_eq!(files[0].size_bytes, expected);
    }

    // ── detect_language ───────────────────────────────────────────────────────

    #[test]
    fn detect_language_rust() {
        assert_eq!(detect_language(Path::new("foo.rs")), Language::Rust);
    }

    #[test]
    fn detect_language_typescript() {
        for name in ["app.ts", "comp.tsx"] {
            assert_eq!(detect_language(Path::new(name)), Language::TypeScript);
        }
    }

    #[test]
    fn detect_language_javascript() {
        for name in ["index.js", "mod.mjs", "cjs.cjs"] {
            assert_eq!(detect_language(Path::new(name)), Language::JavaScript);
        }
    }

    #[test]
    fn detect_language_python() {
        for name in ["main.py", "types.pyi"] {
            assert_eq!(detect_language(Path::new(name)), Language::Python);
        }
    }

    #[test]
    fn detect_language_go() {
        assert_eq!(detect_language(Path::new("main.go")), Language::Go);
    }

    #[test]
    fn detect_language_java() {
        assert_eq!(detect_language(Path::new("Main.java")), Language::Java);
    }

    #[test]
    fn detect_language_c() {
        for name in ["main.c", "header.h"] {
            assert_eq!(detect_language(Path::new(name)), Language::C);
        }
    }

    #[test]
    fn detect_language_cpp() {
        for name in [
            "main.cpp",
            "util.cc",
            "lib.cxx",
            "header.hpp",
            "tmpl.hxx",
            "types.hh",
        ] {
            assert_eq!(detect_language(Path::new(name)), Language::Cpp);
        }
    }

    #[test]
    fn detect_language_ruby() {
        assert_eq!(detect_language(Path::new("app.rb")), Language::Ruby);
    }

    #[test]
    fn detect_language_scala() {
        for name in ["Main.scala", "script.sc"] {
            assert_eq!(detect_language(Path::new(name)), Language::Scala);
        }
    }

    #[test]
    fn detect_language_elixir() {
        for name in ["app.ex", "test.exs"] {
            assert_eq!(detect_language(Path::new(name)), Language::Elixir);
        }
    }

    #[test]
    fn detect_language_haskell() {
        for name in ["Main.hs", "Literate.lhs"] {
            assert_eq!(detect_language(Path::new(name)), Language::Haskell);
        }
    }

    #[test]
    fn detect_language_unknown_for_config_and_text() {
        for name in ["Cargo.toml", "README.md", "script.sh", ".env", "no_extension"] {
            assert_eq!(detect_language(Path::new(name)), Language::Unknown);
        }
    }

    // ── is_binary_extension ───────────────────────────────────────────────────

    #[test]
    fn binary_extensions_are_excluded() {
        let binaries = [
            "image.png",
            "photo.jpg",
            "archive.zip",
            "lib.so",
            "binary.exe",
            "module.wasm",
            "Cargo.lock",
            "yarn.lock",
            "snapshot.snap",
            "data.db",
            "doc.pdf",
        ];
        binaries.iter().copied().for_each(|name| {
            assert!(
                is_binary_extension(Path::new(name)),
                "{name} should be detected as binary"
            );
        });
    }

    #[test]
    fn source_extensions_are_not_binary() {
        let sources = [
            "main.rs",
            "app.py",
            "index.ts",
            "main.go",
            "package.json",
            "Cargo.toml",
            "README.md",
            "style.css",
            "image.svg",
        ];
        sources.iter().copied().for_each(|name| {
            assert!(
                !is_binary_extension(Path::new(name)),
                "{name} should not be detected as binary"
            );
        });
    }
}