1use std::path::{Path, PathBuf};
2
3use argus_core::ArgusError;
4
/// Largest file, in bytes (1 MiB), that `walk_repo` will load; files whose
/// length exceeds this are skipped.
const MAX_FILE_SIZE: u64 = 1_048_576;

/// Number of leading bytes of each file that `walk_repo` scans for a NUL byte
/// when deciding whether the file is binary.
const BINARY_CHECK_SIZE: usize = 8192;
10
/// A source file collected by `walk_repo`, with its contents held in memory.
#[derive(Debug, Clone)]
pub struct SourceFile {
    /// Location of the file. `walk_repo` stores it relative to the walk root
    /// when the root prefix can be stripped, otherwise as yielded by the walker.
    pub path: PathBuf,
    /// Language inferred from the file's extension.
    pub language: Language,
    /// Entire file contents, read as UTF-8 text.
    pub content: String,
}
35
/// Programming languages the walker recognises, identified by file extension
/// (see `Language::from_extension`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// `.rs`
    Rust,
    /// `.py`
    Python,
    /// `.ts`, `.tsx`
    TypeScript,
    /// `.js`, `.jsx`
    JavaScript,
    /// `.go`
    Go,
    /// `.java`
    Java,
    /// `.c`, `.h`
    C,
    /// `.cpp`, `.cc`, `.cxx`, `.hpp`, `.hxx`, `.hh`
    Cpp,
    /// `.rb`
    Ruby,
    /// `.php`
    Php,
    /// `.kt`, `.kts`
    Kotlin,
    /// `.swift`
    Swift,
    /// Any extension without a mapping; has no tree-sitter grammar.
    Unknown,
}
70
71impl Language {
72 pub fn from_extension(ext: &str) -> Self {
74 match ext {
75 "rs" => Language::Rust,
76 "py" => Language::Python,
77 "ts" | "tsx" => Language::TypeScript,
78 "js" | "jsx" => Language::JavaScript,
79 "go" => Language::Go,
80 "java" => Language::Java,
81 "c" | "h" => Language::C,
82 "cpp" | "cc" | "cxx" | "hpp" | "hxx" | "hh" => Language::Cpp,
83 "rb" => Language::Ruby,
84 "php" => Language::Php,
85 "kt" | "kts" => Language::Kotlin,
86 "swift" => Language::Swift,
87 _ => Language::Unknown,
88 }
89 }
90
91 pub fn tree_sitter_language(&self) -> Option<tree_sitter::Language> {
95 match self {
96 Language::Rust => Some(tree_sitter_rust::LANGUAGE.into()),
97 Language::Python => Some(tree_sitter_python::LANGUAGE.into()),
98 Language::TypeScript => Some(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()),
99 Language::JavaScript => Some(tree_sitter_javascript::LANGUAGE.into()),
100 Language::Go => Some(tree_sitter_go::LANGUAGE.into()),
101 Language::Java => Some(tree_sitter_java::LANGUAGE.into()),
102 Language::C => Some(tree_sitter_c::LANGUAGE.into()),
103 Language::Cpp => Some(tree_sitter_cpp::LANGUAGE.into()),
104 Language::Ruby => Some(tree_sitter_ruby::LANGUAGE.into()),
105 Language::Php => Some(tree_sitter_php::LANGUAGE_PHP.into()),
106 Language::Kotlin => Some(tree_sitter_kotlin_ng::LANGUAGE.into()),
107 Language::Swift => Some(tree_sitter_swift::LANGUAGE.into()),
108 Language::Unknown => None,
109 }
110 }
111}
112
113pub fn walk_repo(root: &Path) -> Result<Vec<SourceFile>, ArgusError> {
134 let walker = ignore::WalkBuilder::new(root).build();
135 let mut files = Vec::new();
136
137 for entry in walker {
138 let entry = match entry {
139 Ok(e) => e,
140 Err(_) => continue,
141 };
142
143 let Some(file_type) = entry.file_type() else {
144 continue;
145 };
146 if !file_type.is_file() {
147 continue;
148 }
149
150 let path = entry.path();
151
152 let metadata = match std::fs::metadata(path) {
154 Ok(m) => m,
155 Err(_) => continue,
156 };
157 if metadata.len() > MAX_FILE_SIZE {
158 continue;
159 }
160
161 let ext = match path.extension().and_then(|e| e.to_str()) {
163 Some(e) => e,
164 None => continue,
165 };
166 let language = Language::from_extension(ext);
167 if language == Language::Unknown {
168 continue;
169 }
170
171 let content = match std::fs::read_to_string(path) {
173 Ok(c) => c,
174 Err(_) => continue,
175 };
176
177 let check_len = content.len().min(BINARY_CHECK_SIZE);
179 if content.as_bytes()[..check_len].contains(&0) {
180 continue;
181 }
182
183 let relative = match path.strip_prefix(root) {
185 Ok(r) => r.to_path_buf(),
186 Err(_) => path.to_path_buf(),
187 };
188
189 files.push(SourceFile {
190 path: relative,
191 language,
192 content,
193 });
194 }
195
196 Ok(files)
197}
198
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use std::fs;

    /// Number of files with a recognised extension written by `make_temp_repo`.
    const FIXTURE_SOURCE_COUNT: usize = 12;

    /// Builds a throwaway repository holding one file per supported language
    /// plus two files whose extensions have no language mapping.
    fn make_temp_repo() -> tempfile::TempDir {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        fs::create_dir_all(root.join("src")).unwrap();

        // One fixture per language the walker recognises.
        fs::write(root.join("src/main.rs"), "fn main() {}").unwrap();
        fs::write(root.join("src/lib.py"), "def hello(): pass").unwrap();
        fs::write(root.join("src/app.ts"), "function run() {}").unwrap();
        fs::write(root.join("src/util.js"), "const x = 1;").unwrap();
        fs::write(root.join("src/main.go"), "package main").unwrap();
        fs::write(
            root.join("src/Main.java"),
            "public class Main { public static void main(String[] args) {} }",
        )
        .unwrap();
        fs::write(root.join("src/hello.c"), "int main() { return 0; }").unwrap();
        fs::write(root.join("src/hello.cpp"), "int main() { return 0; }").unwrap();
        fs::write(root.join("src/hello.rb"), "def hello; end").unwrap();
        fs::write(root.join("src/hello.php"), "<?php echo 1;").unwrap();
        fs::write(root.join("src/hello.kt"), "fun main() {}").unwrap();
        fs::write(root.join("src/hello.swift"), "func hello() {}").unwrap();

        // Unmapped extensions: the walker must leave these out.
        fs::write(root.join("README.md"), "# Hello").unwrap();
        fs::write(root.join("data.csv"), "a,b,c").unwrap();

        dir
    }

    #[test]
    fn walk_finds_known_language_files() {
        let dir = make_temp_repo();
        let files = walk_repo(dir.path()).unwrap();

        assert_eq!(files.len(), FIXTURE_SOURCE_COUNT);

        let found: HashSet<Language> = files.iter().map(|f| f.language).collect();
        let expected = [
            Language::Rust,
            Language::Python,
            Language::TypeScript,
            Language::JavaScript,
            Language::Go,
            Language::Java,
            Language::C,
            Language::Cpp,
            Language::Ruby,
            Language::Php,
            Language::Kotlin,
            Language::Swift,
        ];
        for language in expected {
            assert!(found.contains(&language), "missing language: {language:?}");
        }
        assert!(!found.contains(&Language::Unknown));
    }

    #[test]
    fn walk_respects_gitignore() {
        let dir = make_temp_repo();
        let root = dir.path();

        // Presumably needed so the walker treats the directory as a git
        // repository and applies `.gitignore` — verify against `ignore` docs.
        fs::create_dir_all(root.join(".git")).unwrap();

        fs::create_dir_all(root.join("build")).unwrap();
        fs::write(root.join("build/output.rs"), "fn ignored() {}").unwrap();
        fs::write(root.join(".gitignore"), "build/\n").unwrap();

        let files = walk_repo(root).unwrap();

        // Guards against a vacuous pass: everything outside `build/` must
        // still be returned.
        assert_eq!(files.len(), FIXTURE_SOURCE_COUNT);

        for file in &files {
            assert!(
                !file.path.starts_with("build"),
                "gitignored file should be skipped: {}",
                file.path.display()
            );
        }
    }

    #[test]
    fn walk_skips_binary_files() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        // Valid UTF-8 that contains a NUL byte, so only the NUL check can
        // reject it.
        let mut binary_content = b"fn main() { ".to_vec();
        binary_content.push(0);
        binary_content.extend_from_slice(b" }");
        fs::write(root.join("binary.rs"), &binary_content).unwrap();

        fs::write(root.join("normal.rs"), "fn normal() {}").unwrap();

        let files = walk_repo(root).unwrap();
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path, PathBuf::from("normal.rs"));
    }

    #[test]
    fn walk_skips_large_and_unknown_files() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        // One byte over the limit, derived from the constant so the test
        // follows any change to `MAX_FILE_SIZE`.
        let over_limit = usize::try_from(MAX_FILE_SIZE).unwrap() + 1;
        fs::write(root.join("huge.rs"), "x".repeat(over_limit)).unwrap();

        fs::write(root.join("data.txt"), "hello").unwrap();

        fs::write(root.join("ok.rs"), "fn ok() {}").unwrap();

        let files = walk_repo(root).unwrap();
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path, PathBuf::from("ok.rs"));
    }
}