1use fxhash::FxHashSet;
8use memchr::memmem;
9use once_cell::sync::Lazy;
10use std::collections::HashSet;
11use std::path::{Path, PathBuf};
12
13static COLD_EXTENSIONS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
15 [
16 "md", "txt", "rst", "adoc", "wiki", "png", "jpg", "jpeg", "gif", "bmp", "ico", "svg", "webp", "tiff", "mp3", "mp4", "avi",
19 "mkv", "mov", "wmv", "flv", "webm", "m4v", "wav", "flac", "ogg", "aac", "wma",
20 "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "jar", "war", "ear",
22 "exe", "dll", "so", "dylib", "a", "lib", "bin", "out", "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "odt", "ods", "odp",
25 "ttf", "otf", "woff", "woff2", "eot", "tmp", "temp", "cache", "log", "bak", "swp", "swo", "min.js", "min.css",
29 ]
30 .into_iter()
31 .collect()
32});
33
34static HOT_EXTENSIONS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
36 [
37 "rs",
39 "py",
40 "js",
41 "ts",
42 "jsx",
43 "tsx",
44 "go",
45 "java",
46 "c",
47 "cpp",
48 "h",
49 "hpp",
50 "cs",
51 "php",
52 "rb",
53 "swift",
54 "kt",
55 "scala",
56 "clj",
57 "hs",
58 "elm",
59 "ml",
60 "ocaml",
61 "json",
63 "yaml",
64 "yml",
65 "toml",
66 "xml",
67 "html",
68 "css",
69 "scss",
70 "less",
71 "sass",
72 "sh",
74 "bash",
75 "zsh",
76 "fish",
77 "ps1",
78 "cmd",
79 "bat",
80 "dockerfile",
81 "makefile",
82 "sql",
84 "graphql",
85 "prisma",
86 ]
87 .into_iter()
88 .collect()
89});
90
91static COLD_DIRS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
93 [
94 "node_modules",
95 "__pycache__",
96 ".pytest_cache",
97 ".mypy_cache",
98 "target",
99 "build",
100 "dist",
101 ".git",
102 ".hg",
103 ".svn",
104 "vendor",
105 "third_party",
106 "external",
107 "deps",
108 ".idea",
109 ".vscode",
110 ".vs",
111 ".gradle",
112 ".maven",
113 "coverage",
114 ".coverage",
115 ".nyc_output",
116 "logs",
117 "tmp",
118 "temp",
119 ".tmp",
120 ".temp",
121 ]
122 .into_iter()
123 .collect()
124});
125
126static BINARY_MARKERS: Lazy<Vec<&'static [u8]>> = Lazy::new(|| {
128 vec![
129 b"\x7fELF", b"MZ", b"\xca\xfe\xba\xbe", b"\xfe\xed\xfa\xce", b"\x89PNG", b"\xff\xd8\xff", b"GIF8", b"RIFF", b"%PDF", b"PK\x03\x04", ]
140});
141
/// Default upper bound on file size, in bytes (8 MiB). Used as the initial
/// `max_file_size` of a freshly constructed `FileFilter`.
const MAX_CONTENT_SIZE: u64 = 8 << 20;

/// Maximum number of bytes read from the start of a file when sniffing for
/// binary content.
const BINARY_SAMPLE_SIZE: usize = 512;
147
148#[derive(Debug)]
150pub struct FileFilter {
151 allow_extensions: Option<FxHashSet<String>>,
153 deny_extensions: FxHashSet<String>,
155 max_file_size: u64,
157 include_hidden: bool,
159 binary_detection: bool,
161 stats: FilterStats,
163}
164
/// Counters describing what a `FileFilter` has done so far. All fields start
/// at zero.
#[derive(Clone, Debug, Default)]
pub struct FilterStats {
    /// Number of paths handed to the path-level pre-filter.
    pub files_walked: u64,
    /// Number of paths rejected because a component is a cold directory.
    pub dirs_skipped: u64,
    /// Number of paths rejected by an extension rule (allow, deny or cold).
    pub extension_filtered: u64,
    /// Number of files rejected for exceeding the size limit.
    pub size_filtered: u64,
    /// Number of files rejected because their content looked binary.
    pub binary_filtered: u64,
    /// Number of files that passed every check of the full filter.
    pub passed_filter: u64,
    /// Total bytes read from disk while sampling files for binary detection.
    pub bytes_read_for_detection: u64,
}
176
177#[derive(Debug, Clone, PartialEq, Eq)]
179pub enum FilterResult {
180 Include,
182 Exclude(FilterReason),
184}
185
/// The rule that caused a file to be excluded.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum FilterReason {
    /// The extension is in the built-in cold-extension list.
    ColdExtension,
    /// A component of the path is a cold directory name.
    ColdDirectory,
    /// The file exceeds the size limit; carries the file's size in bytes.
    TooLarge(u64),
    /// The file name starts with `.` and hidden files are not included.
    Hidden,
    /// The sampled content looks binary.
    Binary,
    /// The extension was rejected by the configured allow or deny list.
    CustomExtensionFilter,
}
196
197impl FileFilter {
198 pub fn new() -> Self {
200 Self {
201 allow_extensions: None,
202 deny_extensions: FxHashSet::default(),
203 max_file_size: MAX_CONTENT_SIZE,
204 include_hidden: false,
205 binary_detection: true,
206 stats: FilterStats::default(),
207 }
208 }
209
210 pub fn with_allow_extensions(mut self, extensions: Vec<String>) -> Self {
212 self.allow_extensions = Some(extensions.into_iter().map(|e| e.to_lowercase()).collect());
213 self
214 }
215
216 pub fn with_deny_extensions(mut self, extensions: Vec<String>) -> Self {
218 self.deny_extensions = extensions.into_iter().map(|e| e.to_lowercase()).collect();
219 self
220 }
221
222 pub fn with_max_file_size(mut self, size: u64) -> Self {
224 self.max_file_size = size;
225 self
226 }
227
228 pub fn with_include_hidden(mut self, include: bool) -> Self {
230 self.include_hidden = include;
231 self
232 }
233
234 pub fn with_binary_detection(mut self, detect: bool) -> Self {
236 self.binary_detection = detect;
237 self
238 }
239
240 pub fn pre_filter_path(&mut self, path: &Path) -> FilterResult {
242 self.stats.files_walked += 1;
243
244 if !self.include_hidden {
246 if let Some(name) = path.file_name() {
247 if name.to_string_lossy().starts_with('.') {
248 return FilterResult::Exclude(FilterReason::Hidden);
249 }
250 }
251 }
252
253 for component in path.components() {
255 if let std::path::Component::Normal(name) = component {
256 if COLD_DIRS.contains(name.to_str().unwrap_or("")) {
257 self.stats.dirs_skipped += 1;
258 return FilterResult::Exclude(FilterReason::ColdDirectory);
259 }
260 }
261 }
262
263 let extension = path
265 .extension()
266 .and_then(|ext| ext.to_str())
267 .unwrap_or("")
268 .to_lowercase();
269
270 if let Some(ref allow_list) = self.allow_extensions {
272 if !allow_list.contains(&extension) {
273 self.stats.extension_filtered += 1;
274 return FilterResult::Exclude(FilterReason::CustomExtensionFilter);
275 }
276 }
277
278 if self.deny_extensions.contains(&extension) {
279 self.stats.extension_filtered += 1;
280 return FilterResult::Exclude(FilterReason::CustomExtensionFilter);
281 }
282
283 if COLD_EXTENSIONS.contains(extension.as_str()) {
285 self.stats.extension_filtered += 1;
286 return FilterResult::Exclude(FilterReason::ColdExtension);
287 }
288
289 FilterResult::Include
290 }
291
292 pub async fn filter_file(&mut self, path: &Path) -> FilterResult {
294 match self.pre_filter_path(path) {
296 FilterResult::Exclude(reason) => return FilterResult::Exclude(reason),
297 FilterResult::Include => {}
298 }
299
300 if let Ok(metadata) = tokio::fs::metadata(path).await {
302 if metadata.len() > self.max_file_size {
303 self.stats.size_filtered += 1;
304 return FilterResult::Exclude(FilterReason::TooLarge(metadata.len()));
305 }
306
307 if self.binary_detection && self.should_check_binary(path) {
309 if self.is_binary_file(path).await {
310 self.stats.binary_filtered += 1;
311 return FilterResult::Exclude(FilterReason::Binary);
312 }
313 }
314 }
315
316 self.stats.passed_filter += 1;
317 FilterResult::Include
318 }
319
320 fn should_check_binary(&self, path: &Path) -> bool {
322 let extension = path
323 .extension()
324 .and_then(|ext| ext.to_str())
325 .unwrap_or("")
326 .to_lowercase();
327
328 if HOT_EXTENSIONS.contains(extension.as_str()) {
330 return false;
331 }
332
333 if extension.is_empty() {
335 return false;
336 }
337
338 true
339 }
340
341 pub async fn is_binary_file(&mut self, path: &Path) -> bool {
343 match tokio::fs::File::open(path).await {
344 Ok(mut file) => {
345 use tokio::io::AsyncReadExt;
346
347 let mut buffer = vec![0u8; BINARY_SAMPLE_SIZE];
348 match file.read(&mut buffer).await {
349 Ok(bytes_read) => {
350 self.stats.bytes_read_for_detection += bytes_read as u64;
351 buffer.truncate(bytes_read);
352
353 self.detect_binary_content(&buffer)
354 }
355 Err(_) => false, }
357 }
358 Err(_) => false, }
360 }
361
362 fn detect_binary_content(&self, content: &[u8]) -> bool {
364 for marker in BINARY_MARKERS.iter() {
366 if content.starts_with(marker) {
367 return true;
368 }
369 }
370
371 if memchr::memchr(0, content).is_some() {
373 return true;
374 }
375
376 let non_printable = content
378 .iter()
379 .filter(|&&b| b < 32 && b != b'\t' && b != b'\n' && b != b'\r')
380 .count();
381
382 let ratio = non_printable as f64 / content.len() as f64;
383 ratio > 0.05 }
385
386 pub fn stats(&self) -> &FilterStats {
388 &self.stats
389 }
390
391 pub fn reset_stats(&mut self) {
393 self.stats = FilterStats::default();
394 }
395}
396
397impl Default for FileFilter {
398 fn default() -> Self {
399 Self::new()
400 }
401}
402
403#[derive(Debug)]
405pub struct DirectoryFilter {
406 cold_dirs: FxHashSet<String>,
407 stats: DirectoryFilterStats,
408}
409
/// Counters describing what a `DirectoryFilter` has done so far. Both fields
/// start at zero.
#[derive(Default, Debug)]
pub struct DirectoryFilterStats {
    /// Number of directories examined.
    pub dirs_walked: u64,
    /// Number of examined directories that were reported as skippable.
    pub dirs_skipped: u64,
}
415
416impl DirectoryFilter {
417 pub fn new() -> Self {
418 Self {
419 cold_dirs: COLD_DIRS.iter().map(|s| s.to_string()).collect(),
420 stats: DirectoryFilterStats::default(),
421 }
422 }
423
424 pub fn with_additional_cold_dirs(mut self, dirs: Vec<String>) -> Self {
425 self.cold_dirs.extend(dirs);
426 self
427 }
428
429 pub fn should_skip_directory(&mut self, path: &Path) -> bool {
431 self.stats.dirs_walked += 1;
432
433 if let Some(name) = path.file_name() {
434 if let Some(name_str) = name.to_str() {
435 if self.cold_dirs.contains(name_str) {
436 self.stats.dirs_skipped += 1;
437 return true;
438 }
439 }
440 }
441
442 false
443 }
444
445 pub fn stats(&self) -> &DirectoryFilterStats {
446 &self.stats
447 }
448}
449
450impl Default for DirectoryFilter {
451 fn default() -> Self {
452 Self::new()
453 }
454}
455
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;
    use tokio::fs;

    /// Shorthand for the `Exclude` verdict carrying `reason`.
    fn excluded(reason: FilterReason) -> FilterResult {
        FilterResult::Exclude(reason)
    }

    /// Cold extensions are rejected; source extensions pass the path filter.
    #[tokio::test]
    async fn test_cold_extension_filtering() {
        let mut filter = FileFilter::new();

        let image = filter.pre_filter_path(Path::new("test.png"));
        assert_eq!(image, excluded(FilterReason::ColdExtension));

        let source = filter.pre_filter_path(Path::new("code.rs"));
        assert_eq!(source, FilterResult::Include);
    }

    /// Any path running through a cold directory is rejected.
    #[tokio::test]
    async fn test_cold_directory_filtering() {
        let mut filter = FileFilter::new();

        let vendored = filter.pre_filter_path(Path::new("node_modules/package/index.js"));
        assert_eq!(vendored, excluded(FilterReason::ColdDirectory));

        let own_code = filter.pre_filter_path(Path::new("src/main.rs"));
        assert_eq!(own_code, FilterResult::Include);
    }

    /// With an allow list, only listed extensions pass.
    #[tokio::test]
    async fn test_custom_extension_filtering() {
        let allowed = vec!["rs".to_string(), "py".to_string()];
        let mut filter = FileFilter::new().with_allow_extensions(allowed);

        let not_listed = filter.pre_filter_path(Path::new("test.js"));
        assert_eq!(not_listed, excluded(FilterReason::CustomExtensionFilter));

        let listed = filter.pre_filter_path(Path::new("test.rs"));
        assert_eq!(listed, FilterResult::Include);
    }

    /// A file larger than the configured limit is rejected as `TooLarge`.
    ///
    /// The fixture is written relative to the working directory and removed
    /// before the verdict is checked, so it is cleaned up even when the
    /// assertion fails.
    #[tokio::test]
    async fn test_file_size_filtering() {
        let large_file = Path::new("test_large_file.rs");
        let mut filter = FileFilter::new().with_max_file_size(1000);

        let content = "x".repeat(2000);
        fs::write(large_file, &content).await.unwrap();

        let result = filter.filter_file(large_file).await;

        let _ = fs::remove_file(large_file).await;

        match result {
            FilterResult::Exclude(FilterReason::TooLarge(_)) => {}
            other => panic!("Expected TooLarge, got {:?}", other),
        }
    }

    /// Content sniffing flags a file containing NUL bytes and clears plain text.
    #[tokio::test]
    async fn test_binary_detection() {
        let temp_dir = TempDir::new().unwrap();
        let test_dir = temp_dir.path().join("project");
        fs::create_dir_all(&test_dir).await.unwrap();

        let binary_file = test_dir.join("binary.dat");
        let text_file = test_dir.join("text.txt");
        fs::write(&binary_file, &[0u8, 1u8, 2u8, 0u8]).await.unwrap();
        fs::write(&text_file, "Hello, world!").await.unwrap();

        let mut filter = FileFilter::new();

        let binary_verdict = filter.is_binary_file(&binary_file).await;
        assert!(binary_verdict);

        let text_verdict = filter.is_binary_file(&text_file).await;
        assert!(!text_verdict);
    }

    /// Dot-files are rejected unless hidden files are explicitly included.
    #[tokio::test]
    async fn test_hidden_file_filtering() {
        let dotfile = Path::new(".hidden");

        let mut strict = FileFilter::new().with_include_hidden(false);
        assert_eq!(
            strict.pre_filter_path(dotfile),
            excluded(FilterReason::Hidden)
        );

        let mut permissive = FileFilter::new().with_include_hidden(true);
        assert_eq!(permissive.pre_filter_path(dotfile), FilterResult::Include);
    }

    /// Exercises the in-memory heuristic: magic numbers, NUL bytes, plain text.
    #[test]
    fn test_binary_content_detection() {
        let filter = FileFilter::new();

        // Recognised magic numbers.
        assert!(filter.detect_binary_content(b"\x7fELF\x01\x01\x01"));
        assert!(filter.detect_binary_content(b"%PDF-1.4\n"));

        // An embedded NUL byte.
        assert!(filter.detect_binary_content(b"text\x00more text"));

        // Ordinary text, including tabs and newlines.
        assert!(!filter.detect_binary_content(b"Hello, world!\n"));
        assert!(!filter.detect_binary_content(b"fn main() {\n\tprintln!(\"Hello\");\n}"));
    }

    /// Built-in cold directories are skipped and both counters are maintained.
    #[test]
    fn test_directory_filtering() {
        let mut dir_filter = DirectoryFilter::new();

        let skip_node_modules = dir_filter.should_skip_directory(Path::new("node_modules"));
        let skip_target = dir_filter.should_skip_directory(Path::new("target"));
        let skip_src = dir_filter.should_skip_directory(Path::new("src"));

        assert!(skip_node_modules);
        assert!(skip_target);
        assert!(!skip_src);

        let stats = dir_filter.stats();
        assert_eq!(stats.dirs_walked, 3);
        assert_eq!(stats.dirs_skipped, 2);
    }

    /// The path pre-filter counts every call and each kind of rejection, but
    /// never touches `passed_filter`.
    #[test]
    fn test_filter_statistics() {
        let mut filter = FileFilter::new();

        let _ = filter.pre_filter_path(Path::new("test.rs"));
        let _ = filter.pre_filter_path(Path::new("test.png"));
        let _ = filter.pre_filter_path(Path::new("node_modules/pkg/index.js"));
        let _ = filter.pre_filter_path(Path::new(".hidden"));

        let stats = filter.stats();
        assert_eq!(stats.files_walked, 4);
        assert_eq!(stats.extension_filtered, 1);
        assert_eq!(stats.dirs_skipped, 1);
        assert_eq!(stats.passed_filter, 0);
    }
}