1use std::path::{Path, PathBuf};
6use thiserror::Error;
7
8#[derive(Debug, Error)]
10pub enum FilterError {
11 #[error("failed to read file for heuristics: {path}")]
12 ReadFailed { path: PathBuf },
13}
14
/// Source languages the filter knows how to accept.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// Rust source.
    Rust,
    /// TypeScript source (without JSX syntax).
    TypeScript,
    /// TypeScript source with JSX syntax.
    Tsx,
    /// JavaScript source (without JSX syntax).
    JavaScript,
    /// JavaScript source with JSX syntax.
    Jsx,
    /// Python source.
    Python,
    /// Go source.
    Go,
}
26
27impl std::fmt::Display for Language {
28 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29 match self {
30 Language::Rust => write!(f, "rust"),
31 Language::TypeScript => write!(f, "typescript"),
32 Language::Tsx => write!(f, "tsx"),
33 Language::JavaScript => write!(f, "javascript"),
34 Language::Jsx => write!(f, "jsx"),
35 Language::Python => write!(f, "python"),
36 Language::Go => write!(f, "go"),
37 }
38 }
39}
40
41impl std::str::FromStr for Language {
42 type Err = String;
43
44 fn from_str(s: &str) -> Result<Self, Self::Err> {
45 match s.to_lowercase().as_str() {
46 "rust" | "rs" => Ok(Language::Rust),
47 "typescript" | "ts" => Ok(Language::TypeScript),
48 "tsx" => Ok(Language::Tsx),
49 "javascript" | "js" => Ok(Language::JavaScript),
50 "jsx" => Ok(Language::Jsx),
51 "python" | "py" => Ok(Language::Python),
52 "go" => Ok(Language::Go),
53 _ => Err(format!("unknown language: {}", s)),
54 }
55 }
56}
57
58impl Language {
59 pub fn all() -> &'static [Language] {
61 &[
62 Language::Rust,
63 Language::TypeScript,
64 Language::Tsx,
65 Language::JavaScript,
66 Language::Jsx,
67 Language::Python,
68 Language::Go,
69 ]
70 }
71
72 pub fn extensions(&self) -> &'static [&'static str] {
74 match self {
75 Language::Rust => &["rs"],
76 Language::TypeScript => &["ts"],
77 Language::Tsx => &["tsx"],
78 Language::JavaScript => &["js", "mjs", "cjs"],
79 Language::Jsx => &["jsx"],
80 Language::Python => &["py", "pyi"],
81 Language::Go => &["go"],
82 }
83 }
84}
85
86#[derive(Debug, Clone, PartialEq, Eq)]
88pub enum FilterResult {
89 Accept(Language),
91 Reject(RejectReason),
93}
94
/// Why a file was rejected by the filter.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RejectReason {
    /// The extension is on the extension blocklist. `should_process` also
    /// reports this reason for blocklisted file names.
    BlocklistedExtension,
    /// The extension does not map to any supported language.
    UnknownExtension,
    /// The path has no extension (or one that is not valid UTF-8).
    NoExtension,
    /// The content contains a NUL byte.
    BinaryContent,
    /// The file name ends in `.min.<ext>` or the content has an overlong line.
    MinifiedContent,
    /// The start of the content carries a "generated code" marker.
    GeneratedFile,
}
111
112impl std::fmt::Display for RejectReason {
113 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
114 match self {
115 RejectReason::BlocklistedExtension => write!(f, "blocklisted extension"),
116 RejectReason::UnknownExtension => write!(f, "unknown extension"),
117 RejectReason::NoExtension => write!(f, "no extension"),
118 RejectReason::BinaryContent => write!(f, "binary content"),
119 RejectReason::MinifiedContent => write!(f, "minified content"),
120 RejectReason::GeneratedFile => write!(f, "generated file"),
121 }
122 }
123}
124
/// Extensions (lowercase, without the leading dot) that are never processed.
const BLOCKLISTED_EXTENSIONS: &[&str] = &[
    // Images
    "png", "jpg", "jpeg", "gif", "webp", "ico", "svg", "bmp", "tiff",
    // Compiled binaries and libraries
    "wasm", "so", "dll", "dylib", "exe", "bin", "o", "a", "lib",
    // Archives
    "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "tgz",
    // Office documents
    "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
    // Audio and video
    "mp3", "mp4", "wav", "avi", "mov", "mkv", "flac", "ogg",
    // Fonts
    "ttf", "otf", "woff", "woff2", "eot",
    // Lock files
    "lock",
    // Source maps
    "map",
    // Databases
    "db", "sqlite", "sqlite3",
];
146
/// Exact file names (case-sensitive) that are never processed; all of them
/// are package-manager lock files.
const BLOCKLISTED_FILENAMES: &[&str] = &[
    "package-lock.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    "Cargo.lock",
    "poetry.lock",
    "Gemfile.lock",
    "composer.lock",
];
157
/// Substrings whose presence near the top of a file marks it as generated.
/// Matching is case-sensitive.
const GENERATED_MARKERS: &[&str] = &[
    "// Code generated",
    "// DO NOT EDIT",
    "# Generated by",
    "/* Auto-generated */",
    "// This file is auto-generated",
    "@generated",
    "// generated from",
    "// Autogenerated",
    "# Autogenerated",
    "# DO NOT EDIT",
    "<!-- Generated -->",
    "// THIS FILE IS GENERATED",
];
173
/// Longest line, in bytes, a file may contain before `is_minified` treats it
/// as minified.
const MAX_LINE_LENGTH: usize = 500;
176
177pub fn is_blocklisted(extension: &str) -> bool {
179 let ext_lower = extension.to_lowercase();
180 BLOCKLISTED_EXTENSIONS.contains(&ext_lower.as_str())
181}
182
183pub fn is_blocklisted_filename(filename: &str) -> bool {
185 BLOCKLISTED_FILENAMES.contains(&filename)
186}
187
188pub fn detect_language(path: &Path) -> Option<Language> {
190 let ext = path.extension()?.to_str()?.to_lowercase();
191
192 for lang in Language::all() {
193 if lang.extensions().contains(&ext.as_str()) {
194 return Some(*lang);
195 }
196 }
197 None
198}
199
/// Returns `true` if `content` looks binary, i.e. it contains a NUL byte.
pub fn is_binary(content: &[u8]) -> bool {
    content.contains(&0)
}
204
205pub fn is_minified(content: &[u8]) -> bool {
208 let mut line_len = 0;
209 for &b in content {
210 if b == b'\n' {
211 line_len = 0;
212 } else {
213 line_len += 1;
214 if line_len > MAX_LINE_LENGTH {
215 return true; }
217 }
218 }
219 false
220}
221
222pub fn is_generated(content: &[u8]) -> bool {
224 let check_len = content.len().min(2048);
226 let Ok(text) = std::str::from_utf8(&content[..check_len]) else {
227 return false; };
229
230 GENERATED_MARKERS
231 .iter()
232 .any(|marker| text.contains(marker))
233}
234
235pub fn should_process(path: &Path, content: Option<&[u8]>) -> FilterResult {
264 if let Some(filename) = path.file_name().and_then(|f| f.to_str()) {
266 if is_blocklisted_filename(filename) {
267 return FilterResult::Reject(RejectReason::BlocklistedExtension);
268 }
269 }
270
271 let ext = match path.extension().and_then(|e| e.to_str()) {
273 Some(e) => e.to_lowercase(),
274 None => return FilterResult::Reject(RejectReason::NoExtension),
275 };
276
277 if is_blocklisted(&ext) {
279 return FilterResult::Reject(RejectReason::BlocklistedExtension);
280 }
281
282 if let Some(stem) = path.file_stem().and_then(|s| s.to_str()) {
284 if stem.to_lowercase().ends_with(".min") {
285 return FilterResult::Reject(RejectReason::MinifiedContent);
286 }
287 }
288
289 let Some(language) = detect_language(path) else {
291 return FilterResult::Reject(RejectReason::UnknownExtension);
292 };
293
294 if let Some(content) = content {
296 if is_binary(content) {
297 return FilterResult::Reject(RejectReason::BinaryContent);
298 }
299
300 if is_minified(content) {
301 return FilterResult::Reject(RejectReason::MinifiedContent);
302 }
303
304 if is_generated(content) {
305 return FilterResult::Reject(RejectReason::GeneratedFile);
306 }
307 }
308
309 FilterResult::Accept(language)
310}
311
312pub fn passes_extension_filter(path: &Path) -> Option<Language> {
316 match should_process(path, None) {
317 FilterResult::Accept(lang) => Some(lang),
318 FilterResult::Reject(_) => None,
319 }
320}
321
#[cfg(test)]
mod tests {
    use super::*;

    // --- Accepted languages (path-only checks) ---------------------------

    #[test]
    fn test_rust_file() {
        let result = should_process(Path::new("src/main.rs"), None);
        assert_eq!(result, FilterResult::Accept(Language::Rust));
    }

    #[test]
    fn test_typescript_file() {
        let result = should_process(Path::new("src/index.ts"), None);
        assert_eq!(result, FilterResult::Accept(Language::TypeScript));
    }

    #[test]
    fn test_tsx_file() {
        let result = should_process(Path::new("components/App.tsx"), None);
        assert_eq!(result, FilterResult::Accept(Language::Tsx));
    }

    #[test]
    fn test_javascript_file() {
        let result = should_process(Path::new("lib/utils.js"), None);
        assert_eq!(result, FilterResult::Accept(Language::JavaScript));

        let result = should_process(Path::new("lib/utils.mjs"), None);
        assert_eq!(result, FilterResult::Accept(Language::JavaScript));
    }

    #[test]
    fn test_python_file() {
        let result = should_process(Path::new("script.py"), None);
        assert_eq!(result, FilterResult::Accept(Language::Python));

        let result = should_process(Path::new("types.pyi"), None);
        assert_eq!(result, FilterResult::Accept(Language::Python));
    }

    #[test]
    fn test_go_file() {
        let result = should_process(Path::new("main.go"), None);
        assert_eq!(result, FilterResult::Accept(Language::Go));
    }

    // --- Path-based rejections -------------------------------------------

    #[test]
    fn test_blocklisted_extension() {
        let result = should_process(Path::new("image.png"), None);
        assert_eq!(
            result,
            FilterResult::Reject(RejectReason::BlocklistedExtension)
        );

        let result = should_process(Path::new("archive.zip"), None);
        assert_eq!(
            result,
            FilterResult::Reject(RejectReason::BlocklistedExtension)
        );
    }

    #[test]
    fn test_blocklisted_filename() {
        // Blocklisted file names are reported as `BlocklistedExtension`.
        let result = should_process(Path::new("package-lock.json"), None);
        assert_eq!(
            result,
            FilterResult::Reject(RejectReason::BlocklistedExtension)
        );

        let result = should_process(Path::new("Cargo.lock"), None);
        assert_eq!(
            result,
            FilterResult::Reject(RejectReason::BlocklistedExtension)
        );
    }

    #[test]
    fn test_unknown_extension() {
        let result = should_process(Path::new("README.md"), None);
        assert_eq!(result, FilterResult::Reject(RejectReason::UnknownExtension));

        let result = should_process(Path::new("config.yaml"), None);
        assert_eq!(result, FilterResult::Reject(RejectReason::UnknownExtension));
    }

    #[test]
    fn test_no_extension() {
        let result = should_process(Path::new("Makefile"), None);
        assert_eq!(result, FilterResult::Reject(RejectReason::NoExtension));
    }

    #[test]
    fn test_minified_filename() {
        let result = should_process(Path::new("bundle.min.js"), None);
        assert_eq!(result, FilterResult::Reject(RejectReason::MinifiedContent));
    }

    // --- Content-based rejections ----------------------------------------

    #[test]
    fn test_binary_content() {
        let content = b"fn main() {\x00}";
        let result = should_process(Path::new("test.rs"), Some(content));
        assert_eq!(result, FilterResult::Reject(RejectReason::BinaryContent));
    }

    #[test]
    fn test_minified_content() {
        let long_line = "x".repeat(MAX_LINE_LENGTH + 100);
        // Only the middle line is overlong; the surrounding lines are short.
        let content = format!("var x = {{\n{}\n}}", long_line);
        let result = should_process(Path::new("bundle.js"), Some(content.as_bytes()));
        assert_eq!(result, FilterResult::Reject(RejectReason::MinifiedContent));
    }

    #[test]
    fn test_generated_content() {
        let content = b"// Code generated by protoc. DO NOT EDIT.\npackage main";
        let result = should_process(Path::new("proto.go"), Some(content));
        assert_eq!(result, FilterResult::Reject(RejectReason::GeneratedFile));
    }

    #[test]
    fn test_valid_content() {
        let content = b"fn main() {\n    println!(\"Hello\");\n}";
        let result = should_process(Path::new("main.rs"), Some(content));
        assert_eq!(result, FilterResult::Accept(Language::Rust));
    }

    // --- `Language` helpers ----------------------------------------------

    #[test]
    fn test_language_extensions() {
        assert_eq!(Language::Rust.extensions(), &["rs"]);
        assert_eq!(Language::JavaScript.extensions(), &["js", "mjs", "cjs"]);
        assert_eq!(Language::Python.extensions(), &["py", "pyi"]);
    }

    #[test]
    fn test_language_from_str() {
        assert_eq!("rust".parse::<Language>().unwrap(), Language::Rust);
        assert_eq!("rs".parse::<Language>().unwrap(), Language::Rust);
        assert_eq!(
            "typescript".parse::<Language>().unwrap(),
            Language::TypeScript
        );
        assert_eq!("ts".parse::<Language>().unwrap(), Language::TypeScript);
        assert!("invalid".parse::<Language>().is_err());
    }

    #[test]
    fn test_language_display_round_trip() {
        // Every canonical name printed by `Display` must parse back to the
        // same language.
        for lang in Language::all() {
            assert_eq!(lang.to_string().parse::<Language>(), Ok(*lang));
        }
    }

    #[test]
    fn test_passes_extension_filter() {
        assert_eq!(
            passes_extension_filter(Path::new("main.rs")),
            Some(Language::Rust)
        );
        assert_eq!(passes_extension_filter(Path::new("image.png")), None);
    }
}