scribe_scanner/
filtering.rs

1//! High-performance file filtering with early content reads and strict pre-filtering.
2//!
3//! This module implements the performance-critical pre-filtering logic that dramatically
4//! reduces work by eliminating files before expensive operations like content analysis,
5//! git lookups, and heuristic computation.
6
7use fxhash::FxHashSet;
8use memchr::memmem;
9use once_cell::sync::Lazy;
10use scribe_core::FileInfo;
11use std::collections::HashSet;
12use std::path::{Path, PathBuf};
13
14/// Cold file extensions that should be filtered out early
15static COLD_EXTENSIONS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
16    [
17        // Documentation that's rarely code-relevant
18        "md", "txt", "rst", "adoc", "wiki", // Media files
19        "png", "jpg", "jpeg", "gif", "bmp", "ico", "svg", "webp", "tiff", "mp3", "mp4", "avi",
20        "mkv", "mov", "wmv", "flv", "webm", "m4v", "wav", "flac", "ogg", "aac", "wma",
21        // Archives and packages
22        "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "jar", "war", "ear",
23        // Binary executables
24        "exe", "dll", "so", "dylib", "a", "lib", "bin", "out", // Office documents
25        "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "odt", "ods", "odp",
26        // Fonts
27        "ttf", "otf", "woff", "woff2", "eot", // Cache/temp files
28        "tmp", "temp", "cache", "log", "bak", "swp", "swo", // Generated/minified
29        "min.js", "min.css",
30    ]
31    .into_iter()
32    .collect()
33});
34
35/// Hot file extensions that are likely to contain important code
36static HOT_EXTENSIONS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
37    [
38        // Core programming languages
39        "rs",
40        "py",
41        "js",
42        "ts",
43        "jsx",
44        "tsx",
45        "go",
46        "java",
47        "c",
48        "cpp",
49        "h",
50        "hpp",
51        "cs",
52        "php",
53        "rb",
54        "swift",
55        "kt",
56        "scala",
57        "clj",
58        "hs",
59        "elm",
60        "ml",
61        "ocaml",
62        // Configuration and markup with logic
63        "json",
64        "yaml",
65        "yml",
66        "toml",
67        "xml",
68        "html",
69        "css",
70        "scss",
71        "less",
72        "sass",
73        // Scripts and configs
74        "sh",
75        "bash",
76        "zsh",
77        "fish",
78        "ps1",
79        "cmd",
80        "bat",
81        "dockerfile",
82        "makefile",
83        // Database and query languages
84        "sql",
85        "graphql",
86        "prisma",
87    ]
88    .into_iter()
89    .collect()
90});
91
92/// Vendor/generated directory patterns to skip entirely
93static COLD_DIRS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
94    [
95        "node_modules",
96        "__pycache__",
97        ".pytest_cache",
98        ".mypy_cache",
99        "target",
100        "build",
101        "dist",
102        ".git",
103        ".hg",
104        ".svn",
105        "vendor",
106        "third_party",
107        "external",
108        "deps",
109        ".idea",
110        ".vscode",
111        ".vs",
112        ".gradle",
113        ".maven",
114        "coverage",
115        ".coverage",
116        ".nyc_output",
117        "logs",
118        "tmp",
119        "temp",
120        ".tmp",
121        ".temp",
122    ]
123    .into_iter()
124    .collect()
125});
126
127/// Binary content detection patterns (first 512 bytes)
128static BINARY_MARKERS: Lazy<Vec<&'static [u8]>> = Lazy::new(|| {
129    vec![
130        b"\x7fELF",          // ELF binaries
131        b"MZ",               // Windows PE
132        b"\xca\xfe\xba\xbe", // Java class files
133        b"\xfe\xed\xfa\xce", // Mach-O binaries
134        b"\x89PNG",          // PNG images
135        b"\xff\xd8\xff",     // JPEG images
136        b"GIF8",             // GIF images
137        b"RIFF",             // WAV/AVI files
138        b"%PDF",             // PDF files
139        b"PK\x03\x04",       // ZIP files
140    ]
141});
142
/// Default upper bound on the size of a file considered for content-based analysis (8 MiB).
const MAX_CONTENT_SIZE: u64 = 8 << 20;

/// Number of leading bytes sampled from a file for binary detection.
const BINARY_SAMPLE_SIZE: usize = 512;
148
149/// High-performance file filter with strict pre-filtering
150#[derive(Debug)]
151pub struct FileFilter {
152    /// Custom extension allowlist (if set, only these are allowed)
153    allow_extensions: Option<FxHashSet<String>>,
154    /// Custom extension denylist (these are always blocked)  
155    deny_extensions: FxHashSet<String>,
156    /// Maximum file size to process
157    max_file_size: u64,
158    /// Whether to include hidden files
159    include_hidden: bool,
160    /// Whether to perform binary content detection
161    binary_detection: bool,
162    /// Performance counters
163    stats: FilterStats,
164}
165
/// Counters collected by a file filter while it runs.
#[derive(Debug, Default, Clone)]
pub struct FilterStats {
    /// Paths submitted to path-based pre-filtering.
    pub files_walked: u64,
    /// Paths rejected for cold-directory reasons.
    pub dirs_skipped: u64,
    /// Paths rejected because of their extension (custom lists or the built-in cold list).
    pub extension_filtered: u64,
    /// Files rejected for exceeding the configured size limit.
    pub size_filtered: u64,
    /// Files rejected because their content looked binary.
    pub binary_filtered: u64,
    /// Files accepted by the full filter.
    pub passed_filter: u64,
    /// Total bytes read from disk while sampling content for binary detection.
    pub bytes_read_for_detection: u64,
}
177
178/// Result of pre-filtering a single file
179#[derive(Debug, Clone, PartialEq, Eq)]
180pub enum FilterResult {
181    /// File should be processed
182    Include,
183    /// File should be skipped with reason
184    Exclude(FilterReason),
185}
186
/// Why a file was excluded by the filter.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FilterReason {
    /// The extension is on the built-in cold-extension list.
    ColdExtension,
    /// The path was matched against the built-in cold-directory list.
    ColdDirectory,
    /// The file exceeds the size limit; the payload is the file's size in bytes.
    TooLarge(u64),
    /// The file name starts with `.` and hidden files are not included.
    Hidden,
    /// The sampled content looked binary.
    Binary,
    /// The extension was rejected by the custom allowlist or denylist.
    CustomExtensionFilter,
}
197
198impl FileFilter {
199    /// Create a new file filter with performance-optimized defaults
200    pub fn new() -> Self {
201        Self {
202            allow_extensions: None,
203            deny_extensions: FxHashSet::default(),
204            max_file_size: MAX_CONTENT_SIZE,
205            include_hidden: false,
206            binary_detection: true,
207            stats: FilterStats::default(),
208        }
209    }
210
211    /// Set custom extension allowlist (only these extensions will be processed)
212    pub fn with_allow_extensions(mut self, extensions: Vec<String>) -> Self {
213        self.allow_extensions = Some(extensions.into_iter().map(|e| e.to_lowercase()).collect());
214        self
215    }
216
217    /// Add extensions to the deny list
218    pub fn with_deny_extensions(mut self, extensions: Vec<String>) -> Self {
219        self.deny_extensions = extensions.into_iter().map(|e| e.to_lowercase()).collect();
220        self
221    }
222
223    /// Set maximum file size
224    pub fn with_max_file_size(mut self, size: u64) -> Self {
225        self.max_file_size = size;
226        self
227    }
228
229    /// Set whether to include hidden files
230    pub fn with_include_hidden(mut self, include: bool) -> Self {
231        self.include_hidden = include;
232        self
233    }
234
235    /// Set whether to perform binary detection
236    pub fn with_binary_detection(mut self, detect: bool) -> Self {
237        self.binary_detection = detect;
238        self
239    }
240
241    /// Pre-filter a file path without reading contents
242    pub fn pre_filter_path(&mut self, path: &Path) -> FilterResult {
243        self.stats.files_walked += 1;
244
245        // Check hidden files
246        if !self.include_hidden {
247            if let Some(name) = path.file_name() {
248                if name.to_string_lossy().starts_with('.') {
249                    return FilterResult::Exclude(FilterReason::Hidden);
250                }
251            }
252        }
253
254        // Check for cold directories in path
255        for component in path.components() {
256            if let std::path::Component::Normal(name) = component {
257                if COLD_DIRS.contains(name.to_str().unwrap_or("")) {
258                    self.stats.dirs_skipped += 1;
259                    return FilterResult::Exclude(FilterReason::ColdDirectory);
260                }
261            }
262        }
263
264        // Get file extension
265        let extension = path
266            .extension()
267            .and_then(|ext| ext.to_str())
268            .unwrap_or("")
269            .to_lowercase();
270
271        // Apply custom extension filters
272        if let Some(ref allow_list) = self.allow_extensions {
273            if !allow_list.contains(&extension) {
274                self.stats.extension_filtered += 1;
275                return FilterResult::Exclude(FilterReason::CustomExtensionFilter);
276            }
277        }
278
279        if self.deny_extensions.contains(&extension) {
280            self.stats.extension_filtered += 1;
281            return FilterResult::Exclude(FilterReason::CustomExtensionFilter);
282        }
283
284        // Check against cold extensions
285        if COLD_EXTENSIONS.contains(extension.as_str()) {
286            self.stats.extension_filtered += 1;
287            return FilterResult::Exclude(FilterReason::ColdExtension);
288        }
289
290        FilterResult::Include
291    }
292
293    /// Full filter including file size and binary detection
294    pub async fn filter_file(&mut self, path: &Path) -> FilterResult {
295        // First apply path-based filtering
296        match self.pre_filter_path(path) {
297            FilterResult::Exclude(reason) => return FilterResult::Exclude(reason),
298            FilterResult::Include => {}
299        }
300
301        // Check file size
302        if let Ok(metadata) = tokio::fs::metadata(path).await {
303            if metadata.len() > self.max_file_size {
304                self.stats.size_filtered += 1;
305                return FilterResult::Exclude(FilterReason::TooLarge(metadata.len()));
306            }
307
308            // Binary detection if enabled
309            if self.binary_detection && self.should_check_binary(path) {
310                if self.is_binary_file(path).await {
311                    self.stats.binary_filtered += 1;
312                    return FilterResult::Exclude(FilterReason::Binary);
313                }
314            }
315        }
316
317        self.stats.passed_filter += 1;
318        FilterResult::Include
319    }
320
321    /// Check if we should perform binary detection for this file
322    fn should_check_binary(&self, path: &Path) -> bool {
323        let extension = path
324            .extension()
325            .and_then(|ext| ext.to_str())
326            .unwrap_or("")
327            .to_lowercase();
328
329        // Skip binary detection for known text extensions
330        if HOT_EXTENSIONS.contains(extension.as_str()) {
331            return false;
332        }
333
334        // Skip for files with no extension (often text)
335        if extension.is_empty() {
336            return false;
337        }
338
339        true
340    }
341
342    /// Fast binary file detection using content sampling
343    pub async fn is_binary_file(&mut self, path: &Path) -> bool {
344        match tokio::fs::File::open(path).await {
345            Ok(mut file) => {
346                use tokio::io::AsyncReadExt;
347
348                let mut buffer = vec![0u8; BINARY_SAMPLE_SIZE];
349                match file.read(&mut buffer).await {
350                    Ok(bytes_read) => {
351                        self.stats.bytes_read_for_detection += bytes_read as u64;
352                        buffer.truncate(bytes_read);
353
354                        let extension = path.extension().and_then(|ext| ext.to_str());
355
356                        if FileInfo::detect_binary_from_bytes(&buffer, extension) {
357                            return true;
358                        }
359
360                        self.detect_binary_content(&buffer)
361                    }
362                    Err(_) => false, // Assume text if we can't read
363                }
364            }
365            Err(_) => false, // Assume text if we can't open
366        }
367    }
368
369    /// Detect binary content using multiple heuristics
370    fn detect_binary_content(&self, content: &[u8]) -> bool {
371        // Check for known binary markers
372        for marker in BINARY_MARKERS.iter() {
373            if content.starts_with(marker) {
374                return true;
375            }
376        }
377
378        // Null byte check (classic binary detection)
379        if memchr::memchr(0, content).is_some() {
380            return true;
381        }
382
383        // High percentage of non-printable bytes
384        let non_printable = content
385            .iter()
386            .filter(|&&b| b < 32 && b != b'\t' && b != b'\n' && b != b'\r')
387            .count();
388
389        let ratio = non_printable as f64 / content.len() as f64;
390        ratio > 0.05 // More than 5% non-printable
391    }
392
393    /// Get filtering statistics
394    pub fn stats(&self) -> &FilterStats {
395        &self.stats
396    }
397
398    /// Reset statistics
399    pub fn reset_stats(&mut self) {
400        self.stats = FilterStats::default();
401    }
402}
403
404impl Default for FileFilter {
405    fn default() -> Self {
406        Self::new()
407    }
408}
409
410/// Directory-level filtering for efficient tree traversal
411#[derive(Debug)]
412pub struct DirectoryFilter {
413    cold_dirs: FxHashSet<String>,
414    stats: DirectoryFilterStats,
415}
416
/// Counters collected by a directory filter.
///
/// `Clone` is derived so callers can keep a snapshot of the counters they obtain by reference.
#[derive(Debug, Default, Clone)]
pub struct DirectoryFilterStats {
    /// Directories that were checked.
    pub dirs_walked: u64,
    /// Directories that were checked and reported as skippable.
    pub dirs_skipped: u64,
}
422
423impl DirectoryFilter {
424    pub fn new() -> Self {
425        Self {
426            cold_dirs: COLD_DIRS.iter().map(|s| s.to_string()).collect(),
427            stats: DirectoryFilterStats::default(),
428        }
429    }
430
431    pub fn with_additional_cold_dirs(mut self, dirs: Vec<String>) -> Self {
432        self.cold_dirs.extend(dirs);
433        self
434    }
435
436    /// Check if a directory should be skipped entirely
437    pub fn should_skip_directory(&mut self, path: &Path) -> bool {
438        self.stats.dirs_walked += 1;
439
440        if let Some(name) = path.file_name() {
441            if let Some(name_str) = name.to_str() {
442                if self.cold_dirs.contains(name_str) {
443                    self.stats.dirs_skipped += 1;
444                    return true;
445                }
446            }
447        }
448
449        false
450    }
451
452    pub fn stats(&self) -> &DirectoryFilterStats {
453        &self.stats
454    }
455}
456
457impl Default for DirectoryFilter {
458    fn default() -> Self {
459        Self::new()
460    }
461}
462
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;
    use tokio::fs;

    /// Deletes the wrapped file when dropped, so a panicking test cannot leave it behind.
    struct RemoveOnDrop<'a>(&'a Path);

    impl Drop for RemoveOnDrop<'_> {
        fn drop(&mut self) {
            // Best effort: the file may not exist if the test failed before creating it.
            let _ = std::fs::remove_file(self.0);
        }
    }

    // Path-only checks never await anything, so they run as plain synchronous tests.

    #[test]
    fn test_cold_extension_filtering() {
        let mut filter = FileFilter::new();

        assert_eq!(
            filter.pre_filter_path(Path::new("test.png")),
            FilterResult::Exclude(FilterReason::ColdExtension)
        );

        assert_eq!(
            filter.pre_filter_path(Path::new("code.rs")),
            FilterResult::Include
        );
    }

    #[test]
    fn test_cold_directory_filtering() {
        let mut filter = FileFilter::new();

        assert_eq!(
            filter.pre_filter_path(Path::new("node_modules/package/index.js")),
            FilterResult::Exclude(FilterReason::ColdDirectory)
        );

        assert_eq!(
            filter.pre_filter_path(Path::new("src/main.rs")),
            FilterResult::Include
        );
    }

    #[test]
    fn test_custom_extension_filtering() {
        let mut filter =
            FileFilter::new().with_allow_extensions(vec!["rs".to_string(), "py".to_string()]);

        assert_eq!(
            filter.pre_filter_path(Path::new("test.js")),
            FilterResult::Exclude(FilterReason::CustomExtensionFilter)
        );

        assert_eq!(
            filter.pre_filter_path(Path::new("test.rs")),
            FilterResult::Include
        );
    }

    #[tokio::test]
    async fn test_file_size_filtering() {
        // A relative path in the working directory is used on purpose: the system temp
        // directory usually has a component such as "tmp" that matches COLD_DIRS, so a file
        // created there would be rejected as ColdDirectory before the size check runs.
        // The .rs extension is in HOT_EXTENSIONS, not COLD_EXTENSIONS.
        let large_file = Path::new("test_large_file.rs");
        let _cleanup = RemoveOnDrop(large_file);

        // 2000 bytes of content against a 1000-byte limit.
        let content = "x".repeat(2000);
        fs::write(large_file, &content).await.unwrap();

        let mut filter = FileFilter::new().with_max_file_size(1000);

        let result = filter.filter_file(large_file).await;

        assert_eq!(
            result,
            FilterResult::Exclude(FilterReason::TooLarge(2000)),
            "Expected TooLarge"
        );
    }

    #[tokio::test]
    async fn test_binary_detection() {
        let temp_dir = TempDir::new().unwrap();

        let test_dir = temp_dir.path().join("project");
        fs::create_dir_all(&test_dir).await.unwrap();

        // Create a binary file with null bytes
        let binary_file = test_dir.join("binary.dat");
        fs::write(&binary_file, &[0u8, 1u8, 2u8, 0u8])
            .await
            .unwrap();

        // Create a text file
        let text_file = test_dir.join("text.txt");
        fs::write(&text_file, "Hello, world!").await.unwrap();

        let mut filter = FileFilter::new();

        // `is_binary_file` is called directly instead of `filter_file`: the temp dir path
        // typically contains "tmp", so the path-based checks would reject these files as
        // ColdDirectory before any content was sampled.
        assert!(filter.is_binary_file(&binary_file).await);
        assert!(!filter.is_binary_file(&text_file).await);
    }

    #[test]
    fn test_hidden_file_filtering() {
        let mut filter = FileFilter::new().with_include_hidden(false);

        assert_eq!(
            filter.pre_filter_path(Path::new(".hidden")),
            FilterResult::Exclude(FilterReason::Hidden)
        );

        let mut filter = FileFilter::new().with_include_hidden(true);

        assert_eq!(
            filter.pre_filter_path(Path::new(".hidden")),
            FilterResult::Include
        );
    }

    #[test]
    fn test_binary_content_detection() {
        let filter = FileFilter::new();

        // ELF binary
        assert!(filter.detect_binary_content(b"\x7fELF\x01\x01\x01"));

        // PDF file
        assert!(filter.detect_binary_content(b"%PDF-1.4\n"));

        // File with null bytes
        assert!(filter.detect_binary_content(b"text\x00more text"));

        // Regular text
        assert!(!filter.detect_binary_content(b"Hello, world!\n"));

        // Text with tabs and newlines
        assert!(!filter.detect_binary_content(b"fn main() {\n\tprintln!(\"Hello\");\n}"));

        // Empty content is not binary
        assert!(!filter.detect_binary_content(b""));
    }

    #[test]
    fn test_directory_filtering() {
        let mut dir_filter = DirectoryFilter::new();

        assert!(dir_filter.should_skip_directory(Path::new("node_modules")));
        assert!(dir_filter.should_skip_directory(Path::new("target")));
        assert!(!dir_filter.should_skip_directory(Path::new("src")));

        assert_eq!(dir_filter.stats().dirs_walked, 3);
        assert_eq!(dir_filter.stats().dirs_skipped, 2);
    }

    #[test]
    fn test_filter_statistics() {
        let mut filter = FileFilter::new();

        // Test various filtering scenarios
        filter.pre_filter_path(Path::new("test.rs")); // Include
        filter.pre_filter_path(Path::new("test.png")); // Cold extension
        filter.pre_filter_path(Path::new("node_modules/pkg/index.js")); // Cold dir
        filter.pre_filter_path(Path::new(".hidden")); // Hidden

        let stats = filter.stats();
        assert_eq!(stats.files_walked, 4);
        assert_eq!(stats.extension_filtered, 1);
        assert_eq!(stats.dirs_skipped, 1);
        assert_eq!(stats.passed_filter, 0); // pre_filter_path doesn't update passed_filter
    }
}