1use fxhash::FxHashSet;
8use memchr::memmem;
9use once_cell::sync::Lazy;
10use scribe_core::FileInfo;
11use std::collections::HashSet;
12use std::path::{Path, PathBuf};
13
14static COLD_EXTENSIONS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
16 [
17 "md", "txt", "rst", "adoc", "wiki", "png", "jpg", "jpeg", "gif", "bmp", "ico", "svg", "webp", "tiff", "mp3", "mp4", "avi",
20 "mkv", "mov", "wmv", "flv", "webm", "m4v", "wav", "flac", "ogg", "aac", "wma",
21 "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "jar", "war", "ear",
23 "exe", "dll", "so", "dylib", "a", "lib", "bin", "out", "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "odt", "ods", "odp",
26 "ttf", "otf", "woff", "woff2", "eot", "tmp", "temp", "cache", "log", "bak", "swp", "swo", "min.js", "min.css",
30 ]
31 .into_iter()
32 .collect()
33});
34
35static HOT_EXTENSIONS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
37 [
38 "rs",
40 "py",
41 "js",
42 "ts",
43 "jsx",
44 "tsx",
45 "go",
46 "java",
47 "c",
48 "cpp",
49 "h",
50 "hpp",
51 "cs",
52 "php",
53 "rb",
54 "swift",
55 "kt",
56 "scala",
57 "clj",
58 "hs",
59 "elm",
60 "ml",
61 "ocaml",
62 "json",
64 "yaml",
65 "yml",
66 "toml",
67 "xml",
68 "html",
69 "css",
70 "scss",
71 "less",
72 "sass",
73 "sh",
75 "bash",
76 "zsh",
77 "fish",
78 "ps1",
79 "cmd",
80 "bat",
81 "dockerfile",
82 "makefile",
83 "sql",
85 "graphql",
86 "prisma",
87 ]
88 .into_iter()
89 .collect()
90});
91
92static COLD_DIRS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
94 [
95 "node_modules",
96 "__pycache__",
97 ".pytest_cache",
98 ".mypy_cache",
99 "target",
100 "build",
101 "dist",
102 ".git",
103 ".hg",
104 ".svn",
105 "vendor",
106 "third_party",
107 "external",
108 "deps",
109 ".idea",
110 ".vscode",
111 ".vs",
112 ".gradle",
113 ".maven",
114 "coverage",
115 ".coverage",
116 ".nyc_output",
117 "logs",
118 "tmp",
119 "temp",
120 ".tmp",
121 ".temp",
122 ]
123 .into_iter()
124 .collect()
125});
126
127static BINARY_MARKERS: Lazy<Vec<&'static [u8]>> = Lazy::new(|| {
129 vec![
130 b"\x7fELF", b"MZ", b"\xca\xfe\xba\xbe", b"\xfe\xed\xfa\xce", b"\x89PNG", b"\xff\xd8\xff", b"GIF8", b"RIFF", b"%PDF", b"PK\x03\x04", ]
141});
142
/// Default upper bound on file size, in bytes (8 MiB). `FileFilter::new`
/// uses it as the initial `max_file_size`.
const MAX_CONTENT_SIZE: u64 = 8 << 20;

/// Size, in bytes, of the buffer read from the start of a file when sniffing
/// for binary content.
const BINARY_SAMPLE_SIZE: usize = 512;
148
149#[derive(Debug)]
151pub struct FileFilter {
152 allow_extensions: Option<FxHashSet<String>>,
154 deny_extensions: FxHashSet<String>,
156 max_file_size: u64,
158 include_hidden: bool,
160 binary_detection: bool,
162 stats: FilterStats,
164}
165
/// Running counters describing what a [`FileFilter`] has done so far.
#[derive(Debug, Default, Clone)]
pub struct FilterStats {
    /// Number of paths handed to the path pre-filter.
    pub files_walked: u64,
    /// Number of paths rejected because they sit inside a cold directory.
    pub dirs_skipped: u64,
    /// Number of paths rejected by the allow list, deny list or built-in
    /// cold-extension list.
    pub extension_filtered: u64,
    /// Number of files rejected for exceeding the size limit.
    pub size_filtered: u64,
    /// Number of files rejected because their content looked binary.
    pub binary_filtered: u64,
    /// Number of files that passed the full (metadata-aware) filter.
    pub passed_filter: u64,
    /// Total bytes read from files while sniffing for binary content.
    pub bytes_read_for_detection: u64,
}
177
178#[derive(Debug, Clone, PartialEq, Eq)]
180pub enum FilterResult {
181 Include,
183 Exclude(FilterReason),
185}
186
/// Why a file was excluded by a [`FileFilter`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FilterReason {
    /// The extension is in the built-in cold-extension list.
    ColdExtension,
    /// The path lies inside a built-in cold directory.
    ColdDirectory,
    /// The file exceeds the size limit; carries the file's size in bytes.
    TooLarge(u64),
    /// The file name starts with `.` and hidden files are not included.
    Hidden,
    /// The file's content looked binary.
    Binary,
    /// The extension was rejected by the caller's allow or deny list.
    CustomExtensionFilter,
}
197
198impl FileFilter {
199 pub fn new() -> Self {
201 Self {
202 allow_extensions: None,
203 deny_extensions: FxHashSet::default(),
204 max_file_size: MAX_CONTENT_SIZE,
205 include_hidden: false,
206 binary_detection: true,
207 stats: FilterStats::default(),
208 }
209 }
210
211 pub fn with_allow_extensions(mut self, extensions: Vec<String>) -> Self {
213 self.allow_extensions = Some(extensions.into_iter().map(|e| e.to_lowercase()).collect());
214 self
215 }
216
217 pub fn with_deny_extensions(mut self, extensions: Vec<String>) -> Self {
219 self.deny_extensions = extensions.into_iter().map(|e| e.to_lowercase()).collect();
220 self
221 }
222
223 pub fn with_max_file_size(mut self, size: u64) -> Self {
225 self.max_file_size = size;
226 self
227 }
228
229 pub fn with_include_hidden(mut self, include: bool) -> Self {
231 self.include_hidden = include;
232 self
233 }
234
235 pub fn with_binary_detection(mut self, detect: bool) -> Self {
237 self.binary_detection = detect;
238 self
239 }
240
241 pub fn pre_filter_path(&mut self, path: &Path) -> FilterResult {
243 self.stats.files_walked += 1;
244
245 if !self.include_hidden {
247 if let Some(name) = path.file_name() {
248 if name.to_string_lossy().starts_with('.') {
249 return FilterResult::Exclude(FilterReason::Hidden);
250 }
251 }
252 }
253
254 for component in path.components() {
256 if let std::path::Component::Normal(name) = component {
257 if COLD_DIRS.contains(name.to_str().unwrap_or("")) {
258 self.stats.dirs_skipped += 1;
259 return FilterResult::Exclude(FilterReason::ColdDirectory);
260 }
261 }
262 }
263
264 let extension = path
266 .extension()
267 .and_then(|ext| ext.to_str())
268 .unwrap_or("")
269 .to_lowercase();
270
271 if let Some(ref allow_list) = self.allow_extensions {
273 if !allow_list.contains(&extension) {
274 self.stats.extension_filtered += 1;
275 return FilterResult::Exclude(FilterReason::CustomExtensionFilter);
276 }
277 }
278
279 if self.deny_extensions.contains(&extension) {
280 self.stats.extension_filtered += 1;
281 return FilterResult::Exclude(FilterReason::CustomExtensionFilter);
282 }
283
284 if COLD_EXTENSIONS.contains(extension.as_str()) {
286 self.stats.extension_filtered += 1;
287 return FilterResult::Exclude(FilterReason::ColdExtension);
288 }
289
290 FilterResult::Include
291 }
292
293 pub async fn filter_file(&mut self, path: &Path) -> FilterResult {
295 match self.pre_filter_path(path) {
297 FilterResult::Exclude(reason) => return FilterResult::Exclude(reason),
298 FilterResult::Include => {}
299 }
300
301 if let Ok(metadata) = tokio::fs::metadata(path).await {
303 if metadata.len() > self.max_file_size {
304 self.stats.size_filtered += 1;
305 return FilterResult::Exclude(FilterReason::TooLarge(metadata.len()));
306 }
307
308 if self.binary_detection && self.should_check_binary(path) {
310 if self.is_binary_file(path).await {
311 self.stats.binary_filtered += 1;
312 return FilterResult::Exclude(FilterReason::Binary);
313 }
314 }
315 }
316
317 self.stats.passed_filter += 1;
318 FilterResult::Include
319 }
320
321 fn should_check_binary(&self, path: &Path) -> bool {
323 let extension = path
324 .extension()
325 .and_then(|ext| ext.to_str())
326 .unwrap_or("")
327 .to_lowercase();
328
329 if HOT_EXTENSIONS.contains(extension.as_str()) {
331 return false;
332 }
333
334 if extension.is_empty() {
336 return false;
337 }
338
339 true
340 }
341
342 pub async fn is_binary_file(&mut self, path: &Path) -> bool {
344 match tokio::fs::File::open(path).await {
345 Ok(mut file) => {
346 use tokio::io::AsyncReadExt;
347
348 let mut buffer = vec![0u8; BINARY_SAMPLE_SIZE];
349 match file.read(&mut buffer).await {
350 Ok(bytes_read) => {
351 self.stats.bytes_read_for_detection += bytes_read as u64;
352 buffer.truncate(bytes_read);
353
354 let extension = path.extension().and_then(|ext| ext.to_str());
355
356 if FileInfo::detect_binary_from_bytes(&buffer, extension) {
357 return true;
358 }
359
360 self.detect_binary_content(&buffer)
361 }
362 Err(_) => false, }
364 }
365 Err(_) => false, }
367 }
368
369 fn detect_binary_content(&self, content: &[u8]) -> bool {
371 for marker in BINARY_MARKERS.iter() {
373 if content.starts_with(marker) {
374 return true;
375 }
376 }
377
378 if memchr::memchr(0, content).is_some() {
380 return true;
381 }
382
383 let non_printable = content
385 .iter()
386 .filter(|&&b| b < 32 && b != b'\t' && b != b'\n' && b != b'\r')
387 .count();
388
389 let ratio = non_printable as f64 / content.len() as f64;
390 ratio > 0.05 }
392
393 pub fn stats(&self) -> &FilterStats {
395 &self.stats
396 }
397
398 pub fn reset_stats(&mut self) {
400 self.stats = FilterStats::default();
401 }
402}
403
404impl Default for FileFilter {
405 fn default() -> Self {
406 Self::new()
407 }
408}
409
410#[derive(Debug)]
412pub struct DirectoryFilter {
413 cold_dirs: FxHashSet<String>,
414 stats: DirectoryFilterStats,
415}
416
/// Running counters describing what a [`DirectoryFilter`] has done so far.
///
/// `Clone` is derived so that callers can snapshot the counters, since the
/// filter only hands out a shared reference to them.
#[derive(Debug, Default, Clone)]
pub struct DirectoryFilterStats {
    /// Number of directories examined.
    pub dirs_walked: u64,
    /// Number of examined directories that were skipped.
    pub dirs_skipped: u64,
}
422
423impl DirectoryFilter {
424 pub fn new() -> Self {
425 Self {
426 cold_dirs: COLD_DIRS.iter().map(|s| s.to_string()).collect(),
427 stats: DirectoryFilterStats::default(),
428 }
429 }
430
431 pub fn with_additional_cold_dirs(mut self, dirs: Vec<String>) -> Self {
432 self.cold_dirs.extend(dirs);
433 self
434 }
435
436 pub fn should_skip_directory(&mut self, path: &Path) -> bool {
438 self.stats.dirs_walked += 1;
439
440 if let Some(name) = path.file_name() {
441 if let Some(name_str) = name.to_str() {
442 if self.cold_dirs.contains(name_str) {
443 self.stats.dirs_skipped += 1;
444 return true;
445 }
446 }
447 }
448
449 false
450 }
451
452 pub fn stats(&self) -> &DirectoryFilterStats {
453 &self.stats
454 }
455}
456
457impl Default for DirectoryFilter {
458 fn default() -> Self {
459 Self::new()
460 }
461}
462
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;
    use tokio::fs;

    /// Shorthand for the expected outcome of a rejected path.
    fn excluded(reason: FilterReason) -> FilterResult {
        FilterResult::Exclude(reason)
    }

    #[tokio::test]
    async fn test_cold_extension_filtering() {
        let mut filter = FileFilter::new();

        let image = filter.pre_filter_path(Path::new("test.png"));
        assert_eq!(image, excluded(FilterReason::ColdExtension));

        let source = filter.pre_filter_path(Path::new("code.rs"));
        assert_eq!(source, FilterResult::Include);
    }

    #[tokio::test]
    async fn test_cold_directory_filtering() {
        let mut filter = FileFilter::new();

        let dependency = filter.pre_filter_path(Path::new("node_modules/package/index.js"));
        assert_eq!(dependency, excluded(FilterReason::ColdDirectory));

        let source = filter.pre_filter_path(Path::new("src/main.rs"));
        assert_eq!(source, FilterResult::Include);
    }

    #[tokio::test]
    async fn test_custom_extension_filtering() {
        let allowed = vec!["rs".to_string(), "py".to_string()];
        let mut filter = FileFilter::new().with_allow_extensions(allowed);

        let rejected = filter.pre_filter_path(Path::new("test.js"));
        assert_eq!(rejected, excluded(FilterReason::CustomExtensionFilter));

        let accepted = filter.pre_filter_path(Path::new("test.rs"));
        assert_eq!(accepted, FilterResult::Include);
    }

    #[tokio::test]
    async fn test_file_size_filtering() {
        // The file is created in the working directory on purpose: temporary
        // directories usually live under a `tmp` path component, which the
        // path pre-filter would reject before the size check is reached.
        let large_file = Path::new("test_large_file.rs");

        let content = "x".repeat(2000);
        fs::write(large_file, &content).await.unwrap();

        let mut filter = FileFilter::new().with_max_file_size(1000);
        let result = filter.filter_file(large_file).await;

        // Clean up before asserting so a failure does not leave the file behind.
        let _ = fs::remove_file(large_file).await;

        assert!(
            matches!(result, FilterResult::Exclude(FilterReason::TooLarge(_))),
            "Expected TooLarge, got {:?}",
            result
        );
    }

    #[tokio::test]
    async fn test_binary_detection() {
        let temp_dir = TempDir::new().unwrap();

        let project_dir = temp_dir.path().join("project");
        fs::create_dir_all(&project_dir).await.unwrap();

        let binary_file = project_dir.join("binary.dat");
        fs::write(&binary_file, &[0u8, 1u8, 2u8, 0u8])
            .await
            .unwrap();

        let text_file = project_dir.join("text.txt");
        fs::write(&text_file, "Hello, world!").await.unwrap();

        let mut filter = FileFilter::new();

        // `is_binary_file` is called directly, so none of the path-based
        // checks apply to the temporary directory.
        let binary_is_binary = filter.is_binary_file(&binary_file).await;
        let text_is_binary = filter.is_binary_file(&text_file).await;

        assert!(binary_is_binary);
        assert!(!text_is_binary);
    }

    #[tokio::test]
    async fn test_hidden_file_filtering() {
        let dotfile = Path::new(".hidden");

        let mut hiding = FileFilter::new().with_include_hidden(false);
        assert_eq!(
            hiding.pre_filter_path(dotfile),
            excluded(FilterReason::Hidden)
        );

        let mut showing = FileFilter::new().with_include_hidden(true);
        assert_eq!(showing.pre_filter_path(dotfile), FilterResult::Include);
    }

    #[test]
    fn test_binary_content_detection() {
        let filter = FileFilter::new();

        // Magic numbers at the start of the content.
        assert!(filter.detect_binary_content(b"\x7fELF\x01\x01\x01"));
        assert!(filter.detect_binary_content(b"%PDF-1.4\n"));

        // An embedded NUL byte.
        assert!(filter.detect_binary_content(b"text\x00more text"));

        // Plain text, including tabs and newlines.
        assert!(!filter.detect_binary_content(b"Hello, world!\n"));
        assert!(!filter.detect_binary_content(b"fn main() {\n\tprintln!(\"Hello\");\n}"));
    }

    #[test]
    fn test_directory_filtering() {
        let mut dir_filter = DirectoryFilter::new();

        let skip_node_modules = dir_filter.should_skip_directory(Path::new("node_modules"));
        let skip_target = dir_filter.should_skip_directory(Path::new("target"));
        let skip_src = dir_filter.should_skip_directory(Path::new("src"));

        assert!(skip_node_modules);
        assert!(skip_target);
        assert!(!skip_src);

        assert_eq!(dir_filter.stats().dirs_walked, 3);
        assert_eq!(dir_filter.stats().dirs_skipped, 2);
    }

    #[test]
    fn test_filter_statistics() {
        let mut filter = FileFilter::new();

        // One included file, one cold extension, one cold directory and one
        // hidden file, in that order.
        filter.pre_filter_path(Path::new("test.rs"));
        filter.pre_filter_path(Path::new("test.png"));
        filter.pre_filter_path(Path::new("node_modules/pkg/index.js"));
        filter.pre_filter_path(Path::new(".hidden"));

        let stats = filter.stats();
        assert_eq!(stats.files_walked, 4);
        assert_eq!(stats.extension_filtered, 1);
        assert_eq!(stats.dirs_skipped, 1);
        // Only `filter_file` counts files as passed.
        assert_eq!(stats.passed_filter, 0);
    }
}