Skip to main content

seekr_code/scanner/
filter.rs

//! File filter module.
//!
//! Provides binary file detection, a file size limit check,
//! and file type filtering against an allowlist of known extensions.
5
6use std::path::Path;
7
/// Number of leading bytes inspected when deciding whether content is binary (8 KiB).
const BINARY_CHECK_SIZE: usize = 8 * 1024;

/// Null-byte ratio above which content is treated as binary.
///
/// This is a fraction of the inspected bytes, not a percentage: `0.01` means 1%.
const BINARY_NULL_THRESHOLD: f64 = 0.01;
13
/// Lowercase file extensions (without the leading dot) that are accepted for indexing.
///
/// Besides programming languages this also covers markup, configuration, and
/// plain documentation formats. The grouping comments are for readers only.
const SOURCE_EXTENSIONS: &[&str] = &[
    // Rust
    "rs",
    // Python
    "py",
    "pyi",
    "pyx",
    // JavaScript / TypeScript
    "js",
    "jsx",
    "mjs",
    "cjs",
    "ts",
    "tsx",
    "mts",
    "cts",
    // Go
    "go",
    // Java / Kotlin
    "java",
    "kt",
    "kts",
    // C / C++
    "c",
    "h",
    "cc",
    "cpp",
    "cxx",
    "hpp",
    "hxx",
    // C#
    "cs",
    // Ruby
    "rb",
    // PHP
    "php",
    // Swift
    "swift",
    // Scala
    "scala",
    // Shell
    "sh",
    "bash",
    "zsh",
    "fish",
    // Web
    "html",
    "htm",
    "css",
    "scss",
    "sass",
    "less",
    // Data / Config
    "json",
    "yaml",
    "yml",
    "toml",
    "xml",
    "ini",
    "cfg",
    // Markdown / Docs
    "md",
    "rst",
    "txt",
    // SQL
    "sql",
    // Lua
    "lua",
    // Dart
    "dart",
    // Elixir / Erlang
    "ex",
    "exs",
    "erl",
    // Haskell
    "hs",
    // OCaml
    "ml",
    "mli",
    // Zig
    "zig",
    // Protobuf
    "proto",
    // Dockerfile (as an extension, e.g. `app.dockerfile`)
    "dockerfile",
    // Makefile (as an extension, e.g. `rules.makefile`)
    "makefile",
];
67
68/// Check if a file is likely a binary file by examining its content.
69///
70/// Reads up to the first 8KB and checks for null bytes.
71/// Returns `true` if the file appears to be binary.
72pub fn is_binary_file(path: &Path) -> bool {
73    match std::fs::read(path) {
74        Ok(content) => is_binary_content(&content),
75        Err(_) => true, // If we can't read it, treat as binary (skip it)
76    }
77}
78
79/// Check if content appears to be binary.
80///
81/// Uses null-byte heuristic on the first 8KB.
82pub fn is_binary_content(content: &[u8]) -> bool {
83    let check_len = content.len().min(BINARY_CHECK_SIZE);
84    if check_len == 0 {
85        return false; // Empty files are not binary
86    }
87
88    let slice = &content[..check_len];
89    let null_count = slice.iter().filter(|&&b| b == 0).count();
90    let null_ratio = null_count as f64 / check_len as f64;
91
92    null_ratio > BINARY_NULL_THRESHOLD
93}
94
95/// Check if a file has a recognized source code extension.
96pub fn is_source_file(path: &Path) -> bool {
97    // Check for special filenames (no extension)
98    if let Some(filename) = path.file_name().and_then(|f| f.to_str()) {
99        let lower = filename.to_lowercase();
100        if matches!(
101            lower.as_str(),
102            "makefile" | "dockerfile" | "rakefile" | "gemfile" | "cmakelists.txt"
103        ) {
104            return true;
105        }
106    }
107
108    // Check extension
109    path.extension()
110        .and_then(|ext| ext.to_str())
111        .map(|ext| {
112            let lower = ext.to_lowercase();
113            SOURCE_EXTENSIONS.contains(&lower.as_str())
114        })
115        .unwrap_or(false)
116}
117
118/// Check if a file should be included for indexing.
119///
120/// A file passes if:
121/// 1. It has a recognized source code extension
122/// 2. It is not too large (checked by size, not reading content)
123/// 3. It is not binary (checked lazily, only if other checks pass)
124pub fn should_index_file(path: &Path, size: u64, max_file_size: u64) -> bool {
125    // Size check first (cheapest)
126    if size > max_file_size {
127        tracing::debug!(path = %path.display(), size, max = max_file_size, "Skipping oversized file");
128        return false;
129    }
130
131    // Extension check (cheap)
132    if !is_source_file(path) {
133        return false;
134    }
135
136    true
137}
138
#[cfg(test)]
mod tests {
    use super::*;

    /// Text, null-heavy, and empty buffers are classified as expected.
    #[test]
    fn test_binary_detection() {
        // Plain text contains no null bytes.
        let text: &[u8] = b"Hello, world!\nThis is text.";
        assert!(!is_binary_content(text));

        // Three header letters followed by null bytes, 1000 bytes in total.
        let mut elf_like = b"ELF".to_vec();
        elf_like.resize(1000, 0u8);
        assert!(is_binary_content(&elf_like));

        // An empty buffer is not binary.
        assert!(!is_binary_content(&[]));
    }

    /// Known extensions and special file names are accepted; others are not.
    #[test]
    fn test_source_file_detection() {
        let accepted = [
            "main.rs",
            "app.py",
            "index.ts",
            "main.go",
            "config.toml",
            "Makefile",
        ];
        for name in &accepted {
            assert!(is_source_file(Path::new(name)), "{} should be accepted", name);
        }

        let rejected = ["image.png", "data.bin", "archive.tar.gz"];
        for name in &rejected {
            assert!(!is_source_file(Path::new(name)), "{} should be rejected", name);
        }
    }

    /// Size limit and file type are both enforced.
    #[test]
    fn test_should_index_file() {
        let max_size: u64 = 10 * 1024 * 1024; // 10 MB

        let cases = [
            ("main.rs", 1000, true),
            ("main.rs", max_size + 1, false),
            ("image.png", 1000, false),
        ];
        for &(name, size, expected) in &cases {
            assert_eq!(
                should_index_file(Path::new(name), size, max_size),
                expected,
                "{} with size {}",
                name,
                size
            );
        }
    }
}