// seekr_code/scanner/filter.rs

//! File filter module.
//!
//! Provides binary file detection, file size limits,
//! and file type whitelist/blacklist filtering.
5
6use std::path::Path;
7
/// Upper bound on how many leading bytes are inspected for binary detection (8 KiB).
const BINARY_CHECK_SIZE: usize = 8 * 1024;

/// Fraction of null bytes in the inspected prefix above which content is
/// classified as binary (`0.01` means 1%).
const BINARY_NULL_THRESHOLD: f64 = 0.01;
13
/// File extensions (lowercase, without the leading dot) that are treated as
/// source code, grouped by language family.
const SOURCE_EXTENSIONS: &[&str] = &[
    // Rust
    "rs",
    // Python
    "py", "pyi", "pyx",
    // JavaScript / TypeScript
    "js", "jsx", "mjs", "cjs", "ts", "tsx", "mts", "cts",
    // Go
    "go",
    // Java / Kotlin
    "java", "kt", "kts",
    // C / C++
    "c", "h", "cc", "cpp", "cxx", "hpp", "hxx",
    // C#
    "cs",
    // Ruby
    "rb",
    // PHP
    "php",
    // Swift
    "swift",
    // Scala
    "scala",
    // Shell
    "sh", "bash", "zsh", "fish",
    // Web
    "html", "htm", "css", "scss", "sass", "less",
    // Data / Config
    "json", "yaml", "yml", "toml", "xml", "ini", "cfg",
    // Markdown / Docs
    "md", "rst", "txt",
    // SQL
    "sql",
    // Lua
    "lua",
    // Dart
    "dart",
    // Elixir / Erlang
    "ex", "exs", "erl",
    // Haskell
    "hs",
    // OCaml
    "ml", "mli",
    // Zig
    "zig",
    // Protobuf
    "proto",
    // Dockerfile (as an extension, e.g. `app.dockerfile`)
    "dockerfile",
    // Makefile (as an extension, e.g. `rules.makefile`)
    "makefile",
];
103
104/// Check if a file is likely a binary file by examining its content.
105///
106/// Reads up to the first 8KB and checks for null bytes.
107/// Returns `true` if the file appears to be binary.
108pub fn is_binary_file(path: &Path) -> bool {
109    match std::fs::read(path) {
110        Ok(content) => is_binary_content(&content),
111        Err(_) => true, // If we can't read it, treat as binary (skip it)
112    }
113}
114
115/// Check if content appears to be binary.
116///
117/// Uses null-byte heuristic on the first 8KB.
118pub fn is_binary_content(content: &[u8]) -> bool {
119    let check_len = content.len().min(BINARY_CHECK_SIZE);
120    if check_len == 0 {
121        return false; // Empty files are not binary
122    }
123
124    let slice = &content[..check_len];
125    let null_count = slice.iter().filter(|&&b| b == 0).count();
126    let null_ratio = null_count as f64 / check_len as f64;
127
128    null_ratio > BINARY_NULL_THRESHOLD
129}
130
131/// Check if a file has a recognized source code extension.
132pub fn is_source_file(path: &Path) -> bool {
133    // Check for special filenames (no extension)
134    if let Some(filename) = path.file_name().and_then(|f| f.to_str()) {
135        let lower = filename.to_lowercase();
136        if matches!(
137            lower.as_str(),
138            "makefile" | "dockerfile" | "rakefile" | "gemfile" | "cmakelists.txt"
139        ) {
140            return true;
141        }
142    }
143
144    // Check extension
145    path.extension()
146        .and_then(|ext| ext.to_str())
147        .map(|ext| {
148            let lower = ext.to_lowercase();
149            SOURCE_EXTENSIONS.contains(&lower.as_str())
150        })
151        .unwrap_or(false)
152}
153
154/// Check if a file should be included for indexing.
155///
156/// A file passes if:
157/// 1. It has a recognized source code extension
158/// 2. It is not too large (checked by size, not reading content)
159/// 3. It is not binary (checked lazily, only if other checks pass)
160pub fn should_index_file(path: &Path, size: u64, max_file_size: u64) -> bool {
161    // Size check first (cheapest)
162    if size > max_file_size {
163        tracing::debug!(path = %path.display(), size, max = max_file_size, "Skipping oversized file");
164        return false;
165    }
166
167    // Extension check (cheap)
168    if !is_source_file(path) {
169        return false;
170    }
171
172    true
173}
174
#[cfg(test)]
mod tests {
    use super::*;

    /// Text and empty buffers are not binary; a mostly-null buffer is.
    #[test]
    fn test_binary_detection() {
        let text: &[u8] = b"Hello, world!\nThis is text.";
        assert!(!is_binary_content(text));

        // Three header bytes followed by 997 null bytes.
        let mut elf_like = vec![0u8; 1000];
        elf_like[..3].copy_from_slice(b"ELF");
        assert!(is_binary_content(&elf_like));

        let empty: &[u8] = b"";
        assert!(!is_binary_content(empty));
    }

    /// Known extensions and special file names are accepted; others are not.
    #[test]
    fn test_source_file_detection() {
        let accepted = [
            "main.rs",
            "app.py",
            "index.ts",
            "main.go",
            "config.toml",
            "Makefile",
        ];
        for &name in accepted.iter() {
            assert!(is_source_file(Path::new(name)));
        }

        let rejected = ["image.png", "data.bin", "archive.tar.gz"];
        for &name in rejected.iter() {
            assert!(!is_source_file(Path::new(name)));
        }
    }

    /// Indexing requires both an acceptable size and a source-file path.
    #[test]
    fn test_should_index_file() {
        const MAX_SIZE: u64 = 10 * 1024 * 1024; // 10 MB

        let source = Path::new("main.rs");
        assert!(should_index_file(source, 1000, MAX_SIZE));
        assert!(!should_index_file(source, MAX_SIZE + 1, MAX_SIZE));
        assert!(!should_index_file(Path::new("image.png"), 1000, MAX_SIZE));
    }
}