scribe_scanner/
filtering.rs

1//! High-performance file filtering with early content reads and strict pre-filtering.
2//!
3//! This module implements the performance-critical pre-filtering logic that dramatically
4//! reduces work by eliminating files before expensive operations like content analysis,
5//! git lookups, and heuristic computation.
6
7use fxhash::FxHashSet;
8use memchr::memmem;
9use once_cell::sync::Lazy;
10use std::collections::HashSet;
11use std::path::{Path, PathBuf};
12
13/// Cold file extensions that should be filtered out early
14static COLD_EXTENSIONS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
15    [
16        // Documentation that's rarely code-relevant
17        "md", "txt", "rst", "adoc", "wiki", // Media files
18        "png", "jpg", "jpeg", "gif", "bmp", "ico", "svg", "webp", "tiff", "mp3", "mp4", "avi",
19        "mkv", "mov", "wmv", "flv", "webm", "m4v", "wav", "flac", "ogg", "aac", "wma",
20        // Archives and packages
21        "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "jar", "war", "ear",
22        // Binary executables
23        "exe", "dll", "so", "dylib", "a", "lib", "bin", "out", // Office documents
24        "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "odt", "ods", "odp",
25        // Fonts
26        "ttf", "otf", "woff", "woff2", "eot", // Cache/temp files
27        "tmp", "temp", "cache", "log", "bak", "swp", "swo", // Generated/minified
28        "min.js", "min.css",
29    ]
30    .into_iter()
31    .collect()
32});
33
34/// Hot file extensions that are likely to contain important code
35static HOT_EXTENSIONS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
36    [
37        // Core programming languages
38        "rs",
39        "py",
40        "js",
41        "ts",
42        "jsx",
43        "tsx",
44        "go",
45        "java",
46        "c",
47        "cpp",
48        "h",
49        "hpp",
50        "cs",
51        "php",
52        "rb",
53        "swift",
54        "kt",
55        "scala",
56        "clj",
57        "hs",
58        "elm",
59        "ml",
60        "ocaml",
61        // Configuration and markup with logic
62        "json",
63        "yaml",
64        "yml",
65        "toml",
66        "xml",
67        "html",
68        "css",
69        "scss",
70        "less",
71        "sass",
72        // Scripts and configs
73        "sh",
74        "bash",
75        "zsh",
76        "fish",
77        "ps1",
78        "cmd",
79        "bat",
80        "dockerfile",
81        "makefile",
82        // Database and query languages
83        "sql",
84        "graphql",
85        "prisma",
86    ]
87    .into_iter()
88    .collect()
89});
90
91/// Vendor/generated directory patterns to skip entirely
92static COLD_DIRS: Lazy<FxHashSet<&'static str>> = Lazy::new(|| {
93    [
94        "node_modules",
95        "__pycache__",
96        ".pytest_cache",
97        ".mypy_cache",
98        "target",
99        "build",
100        "dist",
101        ".git",
102        ".hg",
103        ".svn",
104        "vendor",
105        "third_party",
106        "external",
107        "deps",
108        ".idea",
109        ".vscode",
110        ".vs",
111        ".gradle",
112        ".maven",
113        "coverage",
114        ".coverage",
115        ".nyc_output",
116        "logs",
117        "tmp",
118        "temp",
119        ".tmp",
120        ".temp",
121    ]
122    .into_iter()
123    .collect()
124});
125
126/// Binary content detection patterns (first 512 bytes)
127static BINARY_MARKERS: Lazy<Vec<&'static [u8]>> = Lazy::new(|| {
128    vec![
129        b"\x7fELF",          // ELF binaries
130        b"MZ",               // Windows PE
131        b"\xca\xfe\xba\xbe", // Java class files
132        b"\xfe\xed\xfa\xce", // Mach-O binaries
133        b"\x89PNG",          // PNG images
134        b"\xff\xd8\xff",     // JPEG images
135        b"GIF8",             // GIF images
136        b"RIFF",             // WAV/AVI files
137        b"%PDF",             // PDF files
138        b"PK\x03\x04",       // ZIP files
139    ]
140});
141
/// Default upper bound on file size for content-based analysis: 8 MiB (8 * 2^20 bytes).
const MAX_CONTENT_SIZE: u64 = 8 << 20;

/// Number of leading bytes sampled for binary detection: 512 (2^9).
const BINARY_SAMPLE_SIZE: usize = 1 << 9;
147
/// High-performance file filter with strict pre-filtering.
///
/// Combines cheap path-based checks (hidden files, cold directories, extension
/// lists) with optional size and binary-content checks, and keeps running
/// counters of what was filtered in `stats`.
#[derive(Debug)]
pub struct FileFilter {
    /// Custom extension allowlist. When `Some`, a file whose lowercased extension
    /// is not in the set is rejected.
    allow_extensions: Option<FxHashSet<String>>,
    /// Custom extension denylist. A file whose lowercased extension is in the set
    /// is always rejected.
    deny_extensions: FxHashSet<String>,
    /// Maximum file size in bytes; larger files are rejected by `filter_file`.
    max_file_size: u64,
    /// Whether files whose name starts with `.` are allowed through.
    include_hidden: bool,
    /// Whether `filter_file` performs content-based binary detection.
    binary_detection: bool,
    /// Performance counters updated by the filtering methods.
    stats: FilterStats,
}
164
/// Performance statistics for filtering operations.
///
/// All counters start at zero (`Default`) and only ever increase until
/// `FileFilter::reset_stats` replaces them with a fresh instance.
#[derive(Debug, Default, Clone)]
pub struct FilterStats {
    /// Number of calls to `FileFilter::pre_filter_path` (one per examined path).
    pub files_walked: u64,
    /// Number of paths rejected because a component matched a cold directory name.
    pub dirs_skipped: u64,
    /// Number of paths rejected by the allowlist, the denylist, or the built-in
    /// cold-extension set.
    pub extension_filtered: u64,
    /// Number of files rejected for exceeding the configured maximum size.
    pub size_filtered: u64,
    /// Number of files rejected by `FileFilter::filter_file` as binary.
    pub binary_filtered: u64,
    /// Number of files that `FileFilter::filter_file` let through.
    /// `pre_filter_path` on its own never updates this counter.
    pub passed_filter: u64,
    /// Total bytes read from disk while sampling files for binary detection.
    pub bytes_read_for_detection: u64,
}
176
/// Result of pre-filtering a single file.
///
/// Returned by both `FileFilter::pre_filter_path` and `FileFilter::filter_file`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FilterResult {
    /// File should be processed.
    Include,
    /// File should be skipped; the payload states why.
    Exclude(FilterReason),
}
185
/// Reasons for filtering out files.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FilterReason {
    /// The extension is in the built-in cold-extension set.
    ColdExtension,
    /// A path component matches a built-in cold directory name.
    ColdDirectory,
    /// The file exceeds the configured maximum size; carries the actual size in
    /// bytes as reported by the file's metadata.
    TooLarge(u64),
    /// The file name starts with `.` and hidden files are not included.
    Hidden,
    /// The sampled content was classified as binary.
    Binary,
    /// The extension was rejected by the custom allowlist or denylist.
    CustomExtensionFilter,
}
196
197impl FileFilter {
198    /// Create a new file filter with performance-optimized defaults
199    pub fn new() -> Self {
200        Self {
201            allow_extensions: None,
202            deny_extensions: FxHashSet::default(),
203            max_file_size: MAX_CONTENT_SIZE,
204            include_hidden: false,
205            binary_detection: true,
206            stats: FilterStats::default(),
207        }
208    }
209
210    /// Set custom extension allowlist (only these extensions will be processed)
211    pub fn with_allow_extensions(mut self, extensions: Vec<String>) -> Self {
212        self.allow_extensions = Some(extensions.into_iter().map(|e| e.to_lowercase()).collect());
213        self
214    }
215
216    /// Add extensions to the deny list
217    pub fn with_deny_extensions(mut self, extensions: Vec<String>) -> Self {
218        self.deny_extensions = extensions.into_iter().map(|e| e.to_lowercase()).collect();
219        self
220    }
221
222    /// Set maximum file size
223    pub fn with_max_file_size(mut self, size: u64) -> Self {
224        self.max_file_size = size;
225        self
226    }
227
228    /// Set whether to include hidden files
229    pub fn with_include_hidden(mut self, include: bool) -> Self {
230        self.include_hidden = include;
231        self
232    }
233
234    /// Set whether to perform binary detection
235    pub fn with_binary_detection(mut self, detect: bool) -> Self {
236        self.binary_detection = detect;
237        self
238    }
239
240    /// Pre-filter a file path without reading contents
241    pub fn pre_filter_path(&mut self, path: &Path) -> FilterResult {
242        self.stats.files_walked += 1;
243
244        // Check hidden files
245        if !self.include_hidden {
246            if let Some(name) = path.file_name() {
247                if name.to_string_lossy().starts_with('.') {
248                    return FilterResult::Exclude(FilterReason::Hidden);
249                }
250            }
251        }
252
253        // Check for cold directories in path
254        for component in path.components() {
255            if let std::path::Component::Normal(name) = component {
256                if COLD_DIRS.contains(name.to_str().unwrap_or("")) {
257                    self.stats.dirs_skipped += 1;
258                    return FilterResult::Exclude(FilterReason::ColdDirectory);
259                }
260            }
261        }
262
263        // Get file extension
264        let extension = path
265            .extension()
266            .and_then(|ext| ext.to_str())
267            .unwrap_or("")
268            .to_lowercase();
269
270        // Apply custom extension filters
271        if let Some(ref allow_list) = self.allow_extensions {
272            if !allow_list.contains(&extension) {
273                self.stats.extension_filtered += 1;
274                return FilterResult::Exclude(FilterReason::CustomExtensionFilter);
275            }
276        }
277
278        if self.deny_extensions.contains(&extension) {
279            self.stats.extension_filtered += 1;
280            return FilterResult::Exclude(FilterReason::CustomExtensionFilter);
281        }
282
283        // Check against cold extensions
284        if COLD_EXTENSIONS.contains(extension.as_str()) {
285            self.stats.extension_filtered += 1;
286            return FilterResult::Exclude(FilterReason::ColdExtension);
287        }
288
289        FilterResult::Include
290    }
291
292    /// Full filter including file size and binary detection
293    pub async fn filter_file(&mut self, path: &Path) -> FilterResult {
294        // First apply path-based filtering
295        match self.pre_filter_path(path) {
296            FilterResult::Exclude(reason) => return FilterResult::Exclude(reason),
297            FilterResult::Include => {}
298        }
299
300        // Check file size
301        if let Ok(metadata) = tokio::fs::metadata(path).await {
302            if metadata.len() > self.max_file_size {
303                self.stats.size_filtered += 1;
304                return FilterResult::Exclude(FilterReason::TooLarge(metadata.len()));
305            }
306
307            // Binary detection if enabled
308            if self.binary_detection && self.should_check_binary(path) {
309                if self.is_binary_file(path).await {
310                    self.stats.binary_filtered += 1;
311                    return FilterResult::Exclude(FilterReason::Binary);
312                }
313            }
314        }
315
316        self.stats.passed_filter += 1;
317        FilterResult::Include
318    }
319
320    /// Check if we should perform binary detection for this file
321    fn should_check_binary(&self, path: &Path) -> bool {
322        let extension = path
323            .extension()
324            .and_then(|ext| ext.to_str())
325            .unwrap_or("")
326            .to_lowercase();
327
328        // Skip binary detection for known text extensions
329        if HOT_EXTENSIONS.contains(extension.as_str()) {
330            return false;
331        }
332
333        // Skip for files with no extension (often text)
334        if extension.is_empty() {
335            return false;
336        }
337
338        true
339    }
340
341    /// Fast binary file detection using content sampling
342    pub async fn is_binary_file(&mut self, path: &Path) -> bool {
343        match tokio::fs::File::open(path).await {
344            Ok(mut file) => {
345                use tokio::io::AsyncReadExt;
346
347                let mut buffer = vec![0u8; BINARY_SAMPLE_SIZE];
348                match file.read(&mut buffer).await {
349                    Ok(bytes_read) => {
350                        self.stats.bytes_read_for_detection += bytes_read as u64;
351                        buffer.truncate(bytes_read);
352
353                        self.detect_binary_content(&buffer)
354                    }
355                    Err(_) => false, // Assume text if we can't read
356                }
357            }
358            Err(_) => false, // Assume text if we can't open
359        }
360    }
361
362    /// Detect binary content using multiple heuristics
363    fn detect_binary_content(&self, content: &[u8]) -> bool {
364        // Check for known binary markers
365        for marker in BINARY_MARKERS.iter() {
366            if content.starts_with(marker) {
367                return true;
368            }
369        }
370
371        // Null byte check (classic binary detection)
372        if memchr::memchr(0, content).is_some() {
373            return true;
374        }
375
376        // High percentage of non-printable bytes
377        let non_printable = content
378            .iter()
379            .filter(|&&b| b < 32 && b != b'\t' && b != b'\n' && b != b'\r')
380            .count();
381
382        let ratio = non_printable as f64 / content.len() as f64;
383        ratio > 0.05 // More than 5% non-printable
384    }
385
386    /// Get filtering statistics
387    pub fn stats(&self) -> &FilterStats {
388        &self.stats
389    }
390
391    /// Reset statistics
392    pub fn reset_stats(&mut self) {
393        self.stats = FilterStats::default();
394    }
395}
396
397impl Default for FileFilter {
398    fn default() -> Self {
399        Self::new()
400    }
401}
402
/// Directory-level filtering for efficient tree traversal.
///
/// Decides, by directory name alone, whether an entire subtree can be skipped.
#[derive(Debug)]
pub struct DirectoryFilter {
    /// Directory names to skip: the built-in `COLD_DIRS` plus any names added
    /// through `with_additional_cold_dirs`.
    cold_dirs: FxHashSet<String>,
    /// Counters updated by `should_skip_directory`.
    stats: DirectoryFilterStats,
}
409
/// Counters collected by `DirectoryFilter` while deciding which directories to skip.
///
/// Derives `Clone` (like `FilterStats`) so callers can take an owned snapshot
/// from the shared reference returned by `DirectoryFilter::stats`.
#[derive(Debug, Default, Clone)]
pub struct DirectoryFilterStats {
    /// Number of directories examined by `DirectoryFilter::should_skip_directory`.
    pub dirs_walked: u64,
    /// Number of examined directories that matched a cold name and were skipped.
    pub dirs_skipped: u64,
}
415
416impl DirectoryFilter {
417    pub fn new() -> Self {
418        Self {
419            cold_dirs: COLD_DIRS.iter().map(|s| s.to_string()).collect(),
420            stats: DirectoryFilterStats::default(),
421        }
422    }
423
424    pub fn with_additional_cold_dirs(mut self, dirs: Vec<String>) -> Self {
425        self.cold_dirs.extend(dirs);
426        self
427    }
428
429    /// Check if a directory should be skipped entirely
430    pub fn should_skip_directory(&mut self, path: &Path) -> bool {
431        self.stats.dirs_walked += 1;
432
433        if let Some(name) = path.file_name() {
434            if let Some(name_str) = name.to_str() {
435                if self.cold_dirs.contains(name_str) {
436                    self.stats.dirs_skipped += 1;
437                    return true;
438                }
439            }
440        }
441
442        false
443    }
444
445    pub fn stats(&self) -> &DirectoryFilterStats {
446        &self.stats
447    }
448}
449
450impl Default for DirectoryFilter {
451    fn default() -> Self {
452        Self::new()
453    }
454}
455
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;
    use tokio::fs;

    /// Run the path-only filter on a path given as a string.
    fn pre_filter(filter: &mut FileFilter, raw_path: &str) -> FilterResult {
        filter.pre_filter_path(Path::new(raw_path))
    }

    #[tokio::test]
    async fn test_cold_extension_filtering() {
        let mut filter = FileFilter::new();

        let image = pre_filter(&mut filter, "test.png");
        assert_eq!(image, FilterResult::Exclude(FilterReason::ColdExtension));

        let source = pre_filter(&mut filter, "code.rs");
        assert_eq!(source, FilterResult::Include);
    }

    #[tokio::test]
    async fn test_cold_directory_filtering() {
        let mut filter = FileFilter::new();

        let vendored = pre_filter(&mut filter, "node_modules/package/index.js");
        assert_eq!(vendored, FilterResult::Exclude(FilterReason::ColdDirectory));

        let own_code = pre_filter(&mut filter, "src/main.rs");
        assert_eq!(own_code, FilterResult::Include);
    }

    #[tokio::test]
    async fn test_custom_extension_filtering() {
        let allowed = vec!["rs".to_string(), "py".to_string()];
        let mut filter = FileFilter::new().with_allow_extensions(allowed);

        let not_listed = pre_filter(&mut filter, "test.js");
        assert_eq!(
            not_listed,
            FilterResult::Exclude(FilterReason::CustomExtensionFilter)
        );

        let listed = pre_filter(&mut filter, "test.rs");
        assert_eq!(listed, FilterResult::Include);
    }

    #[tokio::test]
    async fn test_file_size_filtering() {
        // The file lives in the current directory rather than a temp dir so that
        // no path component collides with a cold directory name such as "tmp".
        // The .rs extension is in HOT_EXTENSIONS, not COLD_EXTENSIONS.
        let oversized = Path::new("test_large_file.rs");

        // 2000 bytes of content against a 1000-byte limit.
        let payload = "x".repeat(2000);
        fs::write(&oversized, &payload).await.unwrap();

        let mut filter = FileFilter::new().with_max_file_size(1000);
        let outcome = filter.filter_file(&oversized).await;

        // Remove the file before asserting so a failure does not leave it behind.
        let _ = fs::remove_file(&oversized).await;

        assert!(
            matches!(outcome, FilterResult::Exclude(FilterReason::TooLarge(_))),
            "Expected TooLarge, got {:?}",
            outcome
        );
    }

    #[tokio::test]
    async fn test_binary_detection() {
        let scratch = TempDir::new().unwrap();

        // Subdirectory whose own name does not match COLD_DIRS.
        let project_dir = scratch.path().join("project");
        fs::create_dir_all(&project_dir).await.unwrap();

        // Binary fixture: contains null bytes.
        let binary_path = project_dir.join("binary.dat");
        fs::write(&binary_path, &[0u8, 1u8, 2u8, 0u8])
            .await
            .unwrap();

        // Text fixture.
        let text_path = project_dir.join("text.txt");
        fs::write(&text_path, "Hello, world!").await.unwrap();

        let mut filter = FileFilter::new();

        // `is_binary_file` is called directly: the temp dir path contains "tmp",
        // so the full filter would reject both files as cold-directory content
        // before it ever looked at their bytes.
        let binary_detected = filter.is_binary_file(&binary_path).await;
        assert!(binary_detected);

        let text_detected = filter.is_binary_file(&text_path).await;
        assert!(!text_detected);
    }

    #[tokio::test]
    async fn test_hidden_file_filtering() {
        let mut excluding = FileFilter::new().with_include_hidden(false);
        assert_eq!(
            pre_filter(&mut excluding, ".hidden"),
            FilterResult::Exclude(FilterReason::Hidden)
        );

        let mut including = FileFilter::new().with_include_hidden(true);
        assert_eq!(pre_filter(&mut including, ".hidden"), FilterResult::Include);
    }

    #[test]
    fn test_binary_content_detection() {
        let filter = FileFilter::new();

        // Samples that must be classified as binary.
        let elf: &[u8] = b"\x7fELF\x01\x01\x01";
        let pdf: &[u8] = b"%PDF-1.4\n";
        let with_null: &[u8] = b"text\x00more text";
        assert!(filter.detect_binary_content(elf));
        assert!(filter.detect_binary_content(pdf));
        assert!(filter.detect_binary_content(with_null));

        // Samples that must be classified as text.
        let plain: &[u8] = b"Hello, world!\n";
        let code: &[u8] = b"fn main() {\n\tprintln!(\"Hello\");\n}";
        assert!(!filter.detect_binary_content(plain));
        assert!(!filter.detect_binary_content(code));
    }

    #[test]
    fn test_directory_filtering() {
        let mut dir_filter = DirectoryFilter::new();

        let skip_node_modules = dir_filter.should_skip_directory(Path::new("node_modules"));
        let skip_target = dir_filter.should_skip_directory(Path::new("target"));
        let skip_src = dir_filter.should_skip_directory(Path::new("src"));

        assert!(skip_node_modules);
        assert!(skip_target);
        assert!(!skip_src);

        let stats = dir_filter.stats();
        assert_eq!(stats.dirs_walked, 3);
        assert_eq!(stats.dirs_skipped, 2);
    }

    #[test]
    fn test_filter_statistics() {
        let mut filter = FileFilter::new();

        // One path per outcome: include, cold extension, cold dir, hidden.
        let samples = [
            "test.rs",
            "test.png",
            "node_modules/pkg/index.js",
            ".hidden",
        ];
        for sample in samples.iter() {
            pre_filter(&mut filter, sample);
        }

        let stats = filter.stats();
        assert_eq!(stats.files_walked, 4);
        assert_eq!(stats.extension_filtered, 1);
        assert_eq!(stats.dirs_skipped, 1);
        assert_eq!(stats.passed_filter, 0); // pre_filter_path doesn't update passed_filter
    }
}
623}