1use ahash::AHashMap;
4use ignore::WalkBuilder;
5use std::collections::HashSet;
6use std::path::{Path, PathBuf};
7use std::sync::LazyLock;
8
/// Broad classification of a recognised file type.
///
/// The category decides which extension set a file type belongs to: only
/// `Code` entries are part of the code-only set, while `Document` entries are
/// added when text files are requested as well.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum FileCategory {
    /// Programming-language source files.
    Code,
    /// Markup, configuration and data files (markdown, yaml, toml, json).
    Document,
}
15
16#[derive(Debug, Clone, Copy)]
17pub struct FileType {
18 pub language: &'static str,
19 pub category: FileCategory,
20}
21
22pub static FILE_TYPES: &[(&str, FileType)] = &[
24 (
25 ".py",
26 FileType {
27 language: "python",
28 category: FileCategory::Code,
29 },
30 ),
31 (
32 ".js",
33 FileType {
34 language: "javascript",
35 category: FileCategory::Code,
36 },
37 ),
38 (
39 ".jsx",
40 FileType {
41 language: "javascript",
42 category: FileCategory::Code,
43 },
44 ),
45 (
46 ".ts",
47 FileType {
48 language: "typescript",
49 category: FileCategory::Code,
50 },
51 ),
52 (
53 ".tsx",
54 FileType {
55 language: "typescript",
56 category: FileCategory::Code,
57 },
58 ),
59 (
60 ".go",
61 FileType {
62 language: "go",
63 category: FileCategory::Code,
64 },
65 ),
66 (
67 ".rs",
68 FileType {
69 language: "rust",
70 category: FileCategory::Code,
71 },
72 ),
73 (
74 ".java",
75 FileType {
76 language: "java",
77 category: FileCategory::Code,
78 },
79 ),
80 (
81 ".kt",
82 FileType {
83 language: "kotlin",
84 category: FileCategory::Code,
85 },
86 ),
87 (
88 ".kts",
89 FileType {
90 language: "kotlin",
91 category: FileCategory::Code,
92 },
93 ),
94 (
95 ".rb",
96 FileType {
97 language: "ruby",
98 category: FileCategory::Code,
99 },
100 ),
101 (
102 ".php",
103 FileType {
104 language: "php",
105 category: FileCategory::Code,
106 },
107 ),
108 (
109 ".c",
110 FileType {
111 language: "c",
112 category: FileCategory::Code,
113 },
114 ),
115 (
116 ".h",
117 FileType {
118 language: "c",
119 category: FileCategory::Code,
120 },
121 ),
122 (
123 ".cpp",
124 FileType {
125 language: "cpp",
126 category: FileCategory::Code,
127 },
128 ),
129 (
130 ".hpp",
131 FileType {
132 language: "cpp",
133 category: FileCategory::Code,
134 },
135 ),
136 (
137 ".cs",
138 FileType {
139 language: "csharp",
140 category: FileCategory::Code,
141 },
142 ),
143 (
144 ".swift",
145 FileType {
146 language: "swift",
147 category: FileCategory::Code,
148 },
149 ),
150 (
151 ".scala",
152 FileType {
153 language: "scala",
154 category: FileCategory::Code,
155 },
156 ),
157 (
158 ".sbt",
159 FileType {
160 language: "scala",
161 category: FileCategory::Code,
162 },
163 ),
164 (
165 ".ex",
166 FileType {
167 language: "elixir",
168 category: FileCategory::Code,
169 },
170 ),
171 (
172 ".exs",
173 FileType {
174 language: "elixir",
175 category: FileCategory::Code,
176 },
177 ),
178 (
179 ".dart",
180 FileType {
181 language: "dart",
182 category: FileCategory::Code,
183 },
184 ),
185 (
186 ".lua",
187 FileType {
188 language: "lua",
189 category: FileCategory::Code,
190 },
191 ),
192 (
193 ".sql",
194 FileType {
195 language: "sql",
196 category: FileCategory::Code,
197 },
198 ),
199 (
200 ".sh",
201 FileType {
202 language: "bash",
203 category: FileCategory::Code,
204 },
205 ),
206 (
207 ".bash",
208 FileType {
209 language: "bash",
210 category: FileCategory::Code,
211 },
212 ),
213 (
214 ".zig",
215 FileType {
216 language: "zig",
217 category: FileCategory::Code,
218 },
219 ),
220 (
221 ".hs",
222 FileType {
223 language: "haskell",
224 category: FileCategory::Code,
225 },
226 ),
227 (
229 ".md",
230 FileType {
231 language: "markdown",
232 category: FileCategory::Document,
233 },
234 ),
235 (
236 ".yaml",
237 FileType {
238 language: "yaml",
239 category: FileCategory::Document,
240 },
241 ),
242 (
243 ".yml",
244 FileType {
245 language: "yaml",
246 category: FileCategory::Document,
247 },
248 ),
249 (
250 ".toml",
251 FileType {
252 language: "toml",
253 category: FileCategory::Document,
254 },
255 ),
256 (
257 ".json",
258 FileType {
259 language: "json",
260 category: FileCategory::Document,
261 },
262 ),
263];
264
/// Names of directories that hold version-control metadata, caches,
/// virtual environments or build output.
///
/// NOTE(review): nothing in this file reads this list; presumably callers
/// use it to prune directory traversal — confirm at the call sites.
pub static DEFAULT_IGNORED_DIRS: &[&str] = &[
    // Version-control metadata.
    ".git",
    ".hg",
    ".svn",
    // Interpreter caches, dependencies and virtual environments.
    "__pycache__",
    "node_modules",
    ".venv",
    "venv",
    ".tox",
    // Tool caches.
    ".mypy_cache",
    ".pytest_cache",
    ".ruff_cache",
    ".cache",
    ".veles",
    // Build and packaging output.
    "dist",
    "build",
    ".eggs",
    "target",
    ".cargo",
    ".next",
    ".nuxt",
];
288
289static EXT_LANG_MAP: LazyLock<AHashMap<&'static str, &'static str>> = LazyLock::new(|| {
291 FILE_TYPES
292 .iter()
293 .map(|(ext, ft)| {
294 let trimmed = ext.strip_prefix('.').unwrap_or(*ext);
296 (trimmed, ft.language)
297 })
298 .collect()
299});
300
301pub fn language_for_path(path: &Path) -> Option<&'static str> {
303 let ext = path.extension()?.to_str()?;
304 if ext
305 .bytes()
306 .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit())
307 {
308 EXT_LANG_MAP.get(ext).copied()
309 } else {
310 let lower = ext.to_ascii_lowercase();
312 EXT_LANG_MAP.get(lower.as_str()).copied()
313 }
314}
315
316static CODE_EXTENSIONS: LazyLock<HashSet<String>> = LazyLock::new(|| {
318 FILE_TYPES
319 .iter()
320 .filter(|(_, ft)| ft.category == FileCategory::Code)
321 .map(|(ext, _)| (*ext).to_string())
322 .collect()
323});
324
325static CODE_AND_DOC_EXTENSIONS: LazyLock<HashSet<String>> = LazyLock::new(|| {
327 FILE_TYPES
328 .iter()
329 .map(|(ext, _)| (*ext).to_string())
330 .collect()
331});
332
333pub fn filter_extensions(
342 extensions: Option<&HashSet<String>>,
343 include_text_files: bool,
344) -> HashSet<String> {
345 if let Some(exts) = extensions {
346 return exts.clone();
347 }
348 if include_text_files {
349 CODE_AND_DOC_EXTENSIONS.clone()
350 } else {
351 CODE_EXTENSIONS.clone()
352 }
353}
354
/// Size ceiling in bytes: `walk_files` drops any file whose reported length
/// is strictly greater than this.
const MAX_FILE_BYTES: u64 = 1_000_000;
357
358pub fn walk_files<'a>(
363 root: &'a Path,
364 extensions: &'a HashSet<String>,
365) -> impl Iterator<Item = PathBuf> + 'a {
366 let mut builder = WalkBuilder::new(root);
367 builder
368 .hidden(true) .git_ignore(true) .git_global(true) .git_exclude(true) .build()
373 .filter_map(move |entry| {
374 let entry = entry.ok()?;
375 if !entry.file_type()?.is_file() {
376 return None;
377 }
378 let path = entry.path();
379
380 let ext = path.extension()?.to_str()?;
382 let matched = if ext
383 .bytes()
384 .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit())
385 {
386 let mut buf = [0u8; 16];
388 let n = ext.len() + 1;
389 if n > buf.len() {
390 return None;
391 }
392 buf[0] = b'.';
393 buf[1..n].copy_from_slice(ext.as_bytes());
394 let s = std::str::from_utf8(&buf[..n]).ok()?;
395 extensions.contains(s)
396 } else {
397 let lower = ext.to_ascii_lowercase();
398 let ext_with_dot = format!(".{lower}");
399 extensions.contains(&ext_with_dot)
400 };
401 if !matched {
402 return None;
403 }
404
405 if let Ok(metadata) = entry.metadata()
408 && metadata.len() > MAX_FILE_BYTES
409 {
410 return None;
411 }
412 Some(path.to_path_buf())
413 })
414}
415
#[cfg(test)]
mod tests {
    use super::*;

    /// Known extensions resolve to their language; a name without an
    /// extension resolves to nothing.
    #[test]
    fn test_language_for_path() {
        let expectations = [
            ("main.rs", Some("rust")),
            ("app.py", Some("python")),
            ("readme.md", Some("markdown")),
            ("Makefile", None),
        ];
        for (name, language) in expectations {
            assert_eq!(language_for_path(Path::new(name)), language);
        }
    }

    /// Without text files, only code extensions are selected.
    #[test]
    fn test_filter_extensions_code_only() {
        let selected = filter_extensions(None, false);
        for ext in [".rs", ".py"] {
            assert!(selected.contains(ext));
        }
        assert!(!selected.contains(".md"));
    }

    /// With text files, document extensions are selected alongside code.
    #[test]
    fn test_filter_extensions_with_text() {
        let selected = filter_extensions(None, true);
        for ext in [".rs", ".md", ".json"] {
            assert!(selected.contains(ext));
        }
    }
}