Skip to main content

argus_repomap/
walker.rs

1use std::path::{Path, PathBuf};
2
3use argus_core::ArgusError;
4
/// Maximum file size to process, in bytes (1 MiB = 1,048,576 bytes).
///
/// `walk_repo` skips any file whose on-disk length is strictly greater than
/// this value; a file of exactly this size is still processed.
const MAX_FILE_SIZE: u64 = 1_048_576;

/// Number of leading bytes inspected for binary detection (8 KiB).
///
/// `walk_repo` treats a file as binary when a NUL byte appears within this
/// many bytes from the start of its content.
const BINARY_CHECK_SIZE: usize = 8192;
10
/// A source file discovered during repository walking.
///
/// Values of this type are produced by `walk_repo`, one per file that passed
/// the size, extension, and binary-content filters. All fields are public, so
/// callers may also construct instances directly (as in the example below).
///
/// # Examples
///
/// ```
/// use std::path::PathBuf;
/// use argus_repomap::walker::{Language, SourceFile};
///
/// let file = SourceFile {
///     path: PathBuf::from("src/main.rs"),
///     language: Language::Rust,
///     content: "fn main() {}".to_string(),
/// };
/// assert_eq!(file.language, Language::Rust);
/// ```
#[derive(Debug, Clone)]
pub struct SourceFile {
    /// Path relative to the repository root.
    ///
    /// When produced by `walk_repo`, this is the walked path with the root
    /// prefix stripped; if the prefix cannot be stripped, the path is stored
    /// exactly as the walker yielded it.
    pub path: PathBuf,
    /// Detected programming language (derived from the file extension).
    pub language: Language,
    /// Full file content, held in memory as a UTF-8 `String`.
    pub content: String,
}
35
/// Programming language detected from file extension.
///
/// Detection is purely extension-based (see [`Language::from_extension`]);
/// file contents are never inspected to decide the language.
///
/// # Examples
///
/// ```
/// use argus_repomap::walker::Language;
///
/// assert_eq!(Language::from_extension("rs"), Language::Rust);
/// assert_eq!(Language::from_extension("py"), Language::Python);
/// assert_eq!(Language::from_extension("java"), Language::Java);
/// assert_eq!(Language::from_extension("c"), Language::C);
/// assert_eq!(Language::from_extension("cpp"), Language::Cpp);
/// assert_eq!(Language::from_extension("rb"), Language::Ruby);
/// assert_eq!(Language::from_extension("php"), Language::Php);
/// assert_eq!(Language::from_extension("kt"), Language::Kotlin);
/// assert_eq!(Language::from_extension("swift"), Language::Swift);
/// assert_eq!(Language::from_extension("txt"), Language::Unknown);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// Rust (`rs`).
    Rust,
    /// Python (`py`).
    Python,
    /// TypeScript (`ts`, `tsx`).
    TypeScript,
    /// JavaScript (`js`, `jsx`).
    JavaScript,
    /// Go (`go`).
    Go,
    /// Java (`java`).
    Java,
    /// C (`c`, `h`). Note that `h` headers are always classified as C.
    C,
    /// C++ (`cpp`, `cc`, `cxx`, `hpp`, `hxx`, `hh`).
    Cpp,
    /// Ruby (`rb`).
    Ruby,
    /// PHP (`php`).
    Php,
    /// Kotlin (`kt`, `kts`).
    Kotlin,
    /// Swift (`swift`).
    Swift,
    /// Any extension that is not recognized; has no tree-sitter grammar.
    Unknown,
}
70
71impl Language {
72    /// Detect language from a file extension string (without the dot).
73    pub fn from_extension(ext: &str) -> Self {
74        match ext {
75            "rs" => Language::Rust,
76            "py" => Language::Python,
77            "ts" | "tsx" => Language::TypeScript,
78            "js" | "jsx" => Language::JavaScript,
79            "go" => Language::Go,
80            "java" => Language::Java,
81            "c" | "h" => Language::C,
82            "cpp" | "cc" | "cxx" | "hpp" | "hxx" | "hh" => Language::Cpp,
83            "rb" => Language::Ruby,
84            "php" => Language::Php,
85            "kt" | "kts" => Language::Kotlin,
86            "swift" => Language::Swift,
87            _ => Language::Unknown,
88        }
89    }
90
91    /// Get the tree-sitter language grammar for this language.
92    ///
93    /// Returns `None` for `Language::Unknown`.
94    pub fn tree_sitter_language(&self) -> Option<tree_sitter::Language> {
95        match self {
96            Language::Rust => Some(tree_sitter_rust::LANGUAGE.into()),
97            Language::Python => Some(tree_sitter_python::LANGUAGE.into()),
98            Language::TypeScript => Some(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()),
99            Language::JavaScript => Some(tree_sitter_javascript::LANGUAGE.into()),
100            Language::Go => Some(tree_sitter_go::LANGUAGE.into()),
101            Language::Java => Some(tree_sitter_java::LANGUAGE.into()),
102            Language::C => Some(tree_sitter_c::LANGUAGE.into()),
103            Language::Cpp => Some(tree_sitter_cpp::LANGUAGE.into()),
104            Language::Ruby => Some(tree_sitter_ruby::LANGUAGE.into()),
105            Language::Php => Some(tree_sitter_php::LANGUAGE_PHP.into()),
106            Language::Kotlin => Some(tree_sitter_kotlin_ng::LANGUAGE.into()),
107            Language::Swift => Some(tree_sitter_swift::LANGUAGE.into()),
108            Language::Unknown => None,
109        }
110    }
111}
112
113/// Walk a repository, respecting `.gitignore`, returning parseable source files.
114///
115/// Skips binary files, files larger than 1 MB, and files with unknown extensions.
116/// Returned paths are relative to `root`.
117///
118/// # Errors
119///
120/// Returns [`ArgusError::Io`] if the root directory cannot be read.
121///
122/// # Examples
123///
124/// ```no_run
125/// use std::path::Path;
126/// use argus_repomap::walker::walk_repo;
127///
128/// let files = walk_repo(Path::new(".")).unwrap();
129/// for f in &files {
130///     println!("{}: {:?}", f.path.display(), f.language);
131/// }
132/// ```
133pub fn walk_repo(root: &Path) -> Result<Vec<SourceFile>, ArgusError> {
134    let walker = ignore::WalkBuilder::new(root).build();
135    let mut files = Vec::new();
136
137    for entry in walker {
138        let entry = match entry {
139            Ok(e) => e,
140            Err(_) => continue,
141        };
142
143        let Some(file_type) = entry.file_type() else {
144            continue;
145        };
146        if !file_type.is_file() {
147            continue;
148        }
149
150        let path = entry.path();
151
152        // Check file size
153        let metadata = match std::fs::metadata(path) {
154            Ok(m) => m,
155            Err(_) => continue,
156        };
157        if metadata.len() > MAX_FILE_SIZE {
158            continue;
159        }
160
161        // Detect language from extension
162        let ext = match path.extension().and_then(|e| e.to_str()) {
163            Some(e) => e,
164            None => continue,
165        };
166        let language = Language::from_extension(ext);
167        if language == Language::Unknown {
168            continue;
169        }
170
171        // Read content
172        let content = match std::fs::read_to_string(path) {
173            Ok(c) => c,
174            Err(_) => continue,
175        };
176
177        // Check for binary content (null bytes in first 8KB)
178        let check_len = content.len().min(BINARY_CHECK_SIZE);
179        if content.as_bytes()[..check_len].contains(&0) {
180            continue;
181        }
182
183        // Make path relative to root
184        let relative = match path.strip_prefix(root) {
185            Ok(r) => r.to_path_buf(),
186            Err(_) => path.to_path_buf(),
187        };
188
189        files.push(SourceFile {
190            path: relative,
191            language,
192            content,
193        });
194    }
195
196    Ok(files)
197}
198
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Number of supported-language files written by `make_temp_repo`.
    const FIXTURE_SOURCE_COUNT: usize = 9;

    /// Build a temporary repository containing one file per language under
    /// test, plus two files with unsupported extensions.
    fn make_temp_repo() -> tempfile::TempDir {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        fs::create_dir_all(root.join("src")).unwrap();

        let fixtures = [
            // Supported source files (FIXTURE_SOURCE_COUNT of them).
            ("src/main.rs", "fn main() {}"),
            ("src/lib.py", "def hello(): pass"),
            ("src/app.ts", "function run() {}"),
            ("src/util.js", "const x = 1;"),
            ("src/main.go", "package main"),
            (
                "src/Main.java",
                "public class Main { public static void main(String[] args) {} }",
            ),
            ("src/hello.c", "int main() { return 0; }"),
            ("src/hello.cpp", "int main() { return 0; }"),
            ("src/hello.rb", "def hello; end"),
            // Unknown extensions: must never show up in walk results.
            ("README.md", "# Hello"),
            ("data.csv", "a,b,c"),
        ];
        for (relative, body) in fixtures {
            fs::write(root.join(relative), body).unwrap();
        }

        dir
    }

    #[test]
    fn walk_finds_known_language_files() {
        let dir = make_temp_repo();
        let files = walk_repo(dir.path()).unwrap();

        assert_eq!(files.len(), FIXTURE_SOURCE_COUNT);

        let found: Vec<Language> = files.iter().map(|f| f.language).collect();
        let expected = [
            Language::Rust,
            Language::Python,
            Language::TypeScript,
            Language::JavaScript,
            Language::Go,
            Language::Java,
            Language::C,
            Language::Cpp,
            Language::Ruby,
        ];
        for language in expected {
            assert!(
                found.contains(&language),
                "expected a {:?} file in the walk results",
                language
            );
        }
    }

    #[test]
    fn walk_respects_gitignore() {
        let dir = make_temp_repo();
        let root = dir.path();

        // The ignore crate needs a .git dir to recognize .gitignore files
        fs::create_dir_all(root.join(".git")).unwrap();

        // Create .gitignore that ignores the build dir
        fs::create_dir_all(root.join("build")).unwrap();
        fs::write(root.join("build/output.rs"), "fn ignored() {}").unwrap();
        fs::write(root.join(".gitignore"), "build/\n").unwrap();

        let files = walk_repo(root).unwrap();

        // Guard against a vacuous pass: the per-path check below would succeed
        // trivially if the walker returned nothing at all.
        assert_eq!(
            files.len(),
            FIXTURE_SOURCE_COUNT,
            "non-ignored source files must still be discovered"
        );

        for file in &files {
            assert!(
                !file.path.starts_with("build"),
                "gitignored file should be skipped: {}",
                file.path.display()
            );
        }
    }

    #[test]
    fn walk_skips_binary_files() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        // Create a binary .rs file (contains null bytes)
        let mut binary_content = b"fn main() { ".to_vec();
        binary_content.push(0);
        binary_content.extend_from_slice(b" }");
        fs::write(root.join("binary.rs"), &binary_content).unwrap();

        // Create a normal .rs file
        fs::write(root.join("normal.rs"), "fn normal() {}").unwrap();

        let files = walk_repo(root).unwrap();
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path, PathBuf::from("normal.rs"));
    }

    #[test]
    fn walk_skips_large_and_unknown_files() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        // Create a file larger than 1MB (one byte over the limit)
        let large_content = "x".repeat(1_048_577);
        fs::write(root.join("huge.rs"), &large_content).unwrap();

        // Create unknown extension
        fs::write(root.join("data.txt"), "hello").unwrap();

        // Create normal file
        fs::write(root.join("ok.rs"), "fn ok() {}").unwrap();

        let files = walk_repo(root).unwrap();
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path, PathBuf::from("ok.rs"));
    }
}