1use ahash::AHashMap;
4use ignore::WalkBuilder;
5use std::collections::HashSet;
6use std::path::{Path, PathBuf};
7use std::sync::LazyLock;
8
/// Broad classification of a recognised file extension.
///
/// [`filter_extensions`] always selects `Code` entries and selects `Document`
/// entries only when text files are requested.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum FileCategory {
    /// Programming-language source files.
    Code,
    /// Text, markup and configuration formats (e.g. Markdown, YAML, JSON).
    Document,
}
15
16#[derive(Debug, Clone, Copy)]
17pub struct FileType {
18 pub language: &'static str,
19 pub category: FileCategory,
20}
21
22pub static FILE_TYPES: &[(&str, FileType)] = &[
24 (
25 ".py",
26 FileType {
27 language: "python",
28 category: FileCategory::Code,
29 },
30 ),
31 (
32 ".js",
33 FileType {
34 language: "javascript",
35 category: FileCategory::Code,
36 },
37 ),
38 (
39 ".jsx",
40 FileType {
41 language: "javascript",
42 category: FileCategory::Code,
43 },
44 ),
45 (
46 ".ts",
47 FileType {
48 language: "typescript",
49 category: FileCategory::Code,
50 },
51 ),
52 (
53 ".tsx",
54 FileType {
55 language: "typescript",
56 category: FileCategory::Code,
57 },
58 ),
59 (
60 ".go",
61 FileType {
62 language: "go",
63 category: FileCategory::Code,
64 },
65 ),
66 (
67 ".rs",
68 FileType {
69 language: "rust",
70 category: FileCategory::Code,
71 },
72 ),
73 (
74 ".java",
75 FileType {
76 language: "java",
77 category: FileCategory::Code,
78 },
79 ),
80 (
81 ".kt",
82 FileType {
83 language: "kotlin",
84 category: FileCategory::Code,
85 },
86 ),
87 (
88 ".kts",
89 FileType {
90 language: "kotlin",
91 category: FileCategory::Code,
92 },
93 ),
94 (
95 ".rb",
96 FileType {
97 language: "ruby",
98 category: FileCategory::Code,
99 },
100 ),
101 (
102 ".php",
103 FileType {
104 language: "php",
105 category: FileCategory::Code,
106 },
107 ),
108 (
109 ".c",
110 FileType {
111 language: "c",
112 category: FileCategory::Code,
113 },
114 ),
115 (
116 ".h",
117 FileType {
118 language: "c",
119 category: FileCategory::Code,
120 },
121 ),
122 (
123 ".cpp",
124 FileType {
125 language: "cpp",
126 category: FileCategory::Code,
127 },
128 ),
129 (
130 ".hpp",
131 FileType {
132 language: "cpp",
133 category: FileCategory::Code,
134 },
135 ),
136 (
137 ".cs",
138 FileType {
139 language: "csharp",
140 category: FileCategory::Code,
141 },
142 ),
143 (
144 ".swift",
145 FileType {
146 language: "swift",
147 category: FileCategory::Code,
148 },
149 ),
150 (
151 ".scala",
152 FileType {
153 language: "scala",
154 category: FileCategory::Code,
155 },
156 ),
157 (
158 ".sbt",
159 FileType {
160 language: "scala",
161 category: FileCategory::Code,
162 },
163 ),
164 (
165 ".ex",
166 FileType {
167 language: "elixir",
168 category: FileCategory::Code,
169 },
170 ),
171 (
172 ".exs",
173 FileType {
174 language: "elixir",
175 category: FileCategory::Code,
176 },
177 ),
178 (
179 ".dart",
180 FileType {
181 language: "dart",
182 category: FileCategory::Code,
183 },
184 ),
185 (
186 ".lua",
187 FileType {
188 language: "lua",
189 category: FileCategory::Code,
190 },
191 ),
192 (
193 ".sql",
194 FileType {
195 language: "sql",
196 category: FileCategory::Code,
197 },
198 ),
199 (
200 ".sh",
201 FileType {
202 language: "bash",
203 category: FileCategory::Code,
204 },
205 ),
206 (
207 ".bash",
208 FileType {
209 language: "bash",
210 category: FileCategory::Code,
211 },
212 ),
213 (
214 ".zig",
215 FileType {
216 language: "zig",
217 category: FileCategory::Code,
218 },
219 ),
220 (
221 ".hs",
222 FileType {
223 language: "haskell",
224 category: FileCategory::Code,
225 },
226 ),
227 (
229 ".md",
230 FileType {
231 language: "markdown",
232 category: FileCategory::Document,
233 },
234 ),
235 (
236 ".yaml",
237 FileType {
238 language: "yaml",
239 category: FileCategory::Document,
240 },
241 ),
242 (
243 ".yml",
244 FileType {
245 language: "yaml",
246 category: FileCategory::Document,
247 },
248 ),
249 (
250 ".toml",
251 FileType {
252 language: "toml",
253 category: FileCategory::Document,
254 },
255 ),
256 (
257 ".json",
258 FileType {
259 language: "json",
260 category: FileCategory::Document,
261 },
262 ),
263];
264
/// Directory names that are ignored by default.
///
/// NOTE(review): nothing in this file reads this list; it is presumably
/// consumed by callers elsewhere — confirm before changing entries.
pub static DEFAULT_IGNORED_DIRS: &[&str] = &[
    // Version-control metadata.
    ".git",
    ".hg",
    ".svn",
    // Interpreter caches, dependencies and virtual environments.
    "__pycache__",
    "node_modules",
    ".venv",
    "venv",
    // Tooling caches.
    ".tox",
    ".mypy_cache",
    ".pytest_cache",
    ".ruff_cache",
    ".cache",
    ".veles",
    // Build and packaging output.
    "dist",
    "build",
    ".eggs",
    "target",
    ".cargo",
    ".next",
    ".nuxt",
];
288
289static EXT_LANG_MAP: LazyLock<AHashMap<&'static str, &'static str>> = LazyLock::new(|| {
291 FILE_TYPES
292 .iter()
293 .map(|(ext, ft)| {
294 let trimmed = ext.strip_prefix('.').unwrap_or(*ext);
296 (trimmed, ft.language)
297 })
298 .collect()
299});
300
301pub fn language_for_path(path: &Path) -> Option<&'static str> {
303 let ext = path.extension()?.to_str()?;
304 if ext
305 .bytes()
306 .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit())
307 {
308 EXT_LANG_MAP.get(ext).copied()
309 } else {
310 let lower = ext.to_ascii_lowercase();
312 EXT_LANG_MAP.get(lower.as_str()).copied()
313 }
314}
315
316pub fn filter_extensions(
318 extensions: Option<&HashSet<String>>,
319 include_text_files: bool,
320) -> HashSet<String> {
321 if let Some(exts) = extensions {
322 return exts.clone();
323 }
324 let categories = if include_text_files {
325 HashSet::from([FileCategory::Code, FileCategory::Document])
326 } else {
327 HashSet::from([FileCategory::Code])
328 };
329 FILE_TYPES
330 .iter()
331 .filter(|(_, ft)| categories.contains(&ft.category))
332 .map(|(ext, _)| ext.to_string())
333 .collect()
334}
335
336const MAX_FILE_BYTES: u64 = 1_000_000;
338
339pub fn walk_files<'a>(
344 root: &'a Path,
345 extensions: &'a HashSet<String>,
346) -> impl Iterator<Item = PathBuf> + 'a {
347 let mut builder = WalkBuilder::new(root);
348 builder
349 .hidden(true) .git_ignore(true) .git_global(true) .git_exclude(true) .build()
354 .filter_map(move |entry| {
355 let entry = entry.ok()?;
356 if !entry.file_type()?.is_file() {
357 return None;
358 }
359 let path = entry.path();
360
361 let ext = path.extension()?.to_str()?;
363 let matched = if ext
364 .bytes()
365 .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit())
366 {
367 let mut buf = [0u8; 16];
369 let n = ext.len() + 1;
370 if n > buf.len() {
371 return None;
372 }
373 buf[0] = b'.';
374 buf[1..n].copy_from_slice(ext.as_bytes());
375 let s = std::str::from_utf8(&buf[..n]).ok()?;
376 extensions.contains(s)
377 } else {
378 let lower = ext.to_ascii_lowercase();
379 let ext_with_dot = format!(".{lower}");
380 extensions.contains(&ext_with_dot)
381 };
382 if !matched {
383 return None;
384 }
385
386 if let Ok(metadata) = entry.metadata()
389 && metadata.len() > MAX_FILE_BYTES
390 {
391 return None;
392 }
393 Some(path.to_path_buf())
394 })
395}
396
#[cfg(test)]
mod tests {
    use super::*;

    /// Known extensions resolve to a language; a name without one does not.
    #[test]
    fn test_language_for_path() {
        let lang = |name: &str| language_for_path(Path::new(name));

        assert_eq!(lang("main.rs"), Some("rust"));
        assert_eq!(lang("app.py"), Some("python"));
        assert_eq!(lang("readme.md"), Some("markdown"));
        assert_eq!(lang("Makefile"), None);
    }

    /// Without text files, only code extensions are selected.
    #[test]
    fn test_filter_extensions_code_only() {
        let selected = filter_extensions(None, false);
        let has = |ext: &str| selected.contains(ext);

        assert!(has(".rs"));
        assert!(has(".py"));
        assert!(!has(".md"));
    }

    /// With text files, document extensions are selected as well.
    #[test]
    fn test_filter_extensions_with_text() {
        let selected = filter_extensions(None, true);
        let has = |ext: &str| selected.contains(ext);

        assert!(has(".rs"));
        assert!(has(".md"));
        assert!(has(".json"));
    }
}