seekr_code/scanner/
filter.rs1use std::path::Path;
7
/// Upper bound, in bytes, on the leading portion of a buffer that is sampled
/// when deciding whether content is binary (8 KiB).
const BINARY_CHECK_SIZE: usize = 8 * 1024;
10
/// Fraction of NUL bytes in the sampled prefix that must be *exceeded* for
/// the content to be classified as binary (1%).
const BINARY_NULL_THRESHOLD: f64 = 0.01;
13
/// File extensions (lowercase, without the leading dot) that are treated as
/// indexable source or text files.
///
/// Lookups lowercase the candidate extension before comparing against this
/// list, so every entry here must itself be lowercase.
const SOURCE_EXTENSIONS: &[&str] = &[
    // Rust
    "rs",
    // Python
    "py", "pyi", "pyx",
    // JavaScript / TypeScript
    "js", "jsx", "mjs", "cjs", "ts", "tsx", "mts", "cts",
    // Go
    "go",
    // JVM languages
    "java", "kt", "kts",
    // C / C++
    "c", "h", "cc", "cpp", "cxx", "hpp", "hxx",
    // C#
    "cs",
    // Ruby
    "rb",
    // PHP
    "php",
    // Swift
    "swift",
    // Scala
    "scala",
    // Shell
    "sh", "bash", "zsh", "fish",
    // Web markup and stylesheets
    "html", "htm", "css", "scss", "sass", "less",
    // Data and configuration
    "json", "yaml", "yml", "toml", "xml", "ini", "cfg",
    // Documentation and plain text
    "md", "rst", "txt",
    // SQL
    "sql",
    // Lua
    "lua",
    // Dart
    "dart",
    // Elixir / Erlang
    "ex", "exs", "erl",
    // Haskell
    "hs",
    // OCaml
    "ml", "mli",
    // Zig
    "zig",
    // Protocol Buffers
    "proto",
    // Build descriptions used as an extension (e.g. `app.dockerfile`)
    "dockerfile", "makefile",
];
103
104pub fn is_binary_file(path: &Path) -> bool {
109 match std::fs::read(path) {
110 Ok(content) => is_binary_content(&content),
111 Err(_) => true, }
113}
114
115pub fn is_binary_content(content: &[u8]) -> bool {
119 let check_len = content.len().min(BINARY_CHECK_SIZE);
120 if check_len == 0 {
121 return false; }
123
124 let slice = &content[..check_len];
125 let null_count = slice.iter().filter(|&&b| b == 0).count();
126 let null_ratio = null_count as f64 / check_len as f64;
127
128 null_ratio > BINARY_NULL_THRESHOLD
129}
130
131pub fn is_source_file(path: &Path) -> bool {
133 if let Some(filename) = path.file_name().and_then(|f| f.to_str()) {
135 let lower = filename.to_lowercase();
136 if matches!(
137 lower.as_str(),
138 "makefile" | "dockerfile" | "rakefile" | "gemfile" | "cmakelists.txt"
139 ) {
140 return true;
141 }
142 }
143
144 path.extension()
146 .and_then(|ext| ext.to_str())
147 .map(|ext| {
148 let lower = ext.to_lowercase();
149 SOURCE_EXTENSIONS.contains(&lower.as_str())
150 })
151 .unwrap_or(false)
152}
153
154pub fn should_index_file(path: &Path, size: u64, max_file_size: u64) -> bool {
161 if size > max_file_size {
163 tracing::debug!(path = %path.display(), size, max = max_file_size, "Skipping oversized file");
164 return false;
165 }
166
167 if !is_source_file(path) {
169 return false;
170 }
171
172 true
173}
174
#[cfg(test)]
mod tests {
    use super::*;

    /// Text, NUL-heavy, and empty buffers are classified as expected.
    #[test]
    fn test_binary_detection() {
        // Plain text contains no NUL bytes.
        let text: &[u8] = b"Hello, world!\nThis is text.";
        assert!(!is_binary_content(text));

        // 1000 bytes that are all NUL apart from a three-byte "ELF" prefix.
        let mut elf_like = vec![0u8; 1000];
        elf_like[..3].copy_from_slice(b"ELF");
        assert!(is_binary_content(&elf_like));

        // Empty input is never binary.
        assert!(!is_binary_content(&[]));
    }

    /// Known extensions and well-known file names are accepted; others are not.
    #[test]
    fn test_source_file_detection() {
        let accepted: &[&str] = &[
            "main.rs",
            "app.py",
            "index.ts",
            "main.go",
            "config.toml",
            "Makefile",
        ];
        for name in accepted {
            assert!(is_source_file(Path::new(name)), "{} should be accepted", name);
        }

        let rejected: &[&str] = &["image.png", "data.bin", "archive.tar.gz"];
        for name in rejected {
            assert!(!is_source_file(Path::new(name)), "{} should be rejected", name);
        }
    }

    /// Both the size limit and the source-file check gate indexing.
    #[test]
    fn test_should_index_file() {
        let limit: u64 = 10 * 1024 * 1024;
        let rust_file = Path::new("main.rs");

        assert!(should_index_file(rust_file, 1000, limit));
        assert!(!should_index_file(rust_file, limit + 1, limit));
        assert!(!should_index_file(Path::new("image.png"), 1000, limit));
    }
}