seekr_code/scanner/
filter.rs1use std::path::Path;
7
/// Maximum number of leading bytes (8 KiB) that binary detection inspects.
///
/// `is_binary_content` never looks past this many bytes of its input.
const BINARY_CHECK_SIZE: usize = 8 * 1024;
10
/// Fraction of NUL bytes (1%) in the inspected prefix that must be *exceeded*
/// for content to be classified as binary.
const BINARY_NULL_THRESHOLD: f64 = 1.0 / 100.0;
13
/// Lower-case file extensions (without the leading dot) that `is_source_file`
/// accepts as source or text files.
///
/// Entries are grouped by language or format family; the grouping is purely
/// for readability.
const SOURCE_EXTENSIONS: &[&str] = &[
    // Rust
    "rs",
    // Python
    "py",
    "pyi",
    "pyx",
    // JavaScript / TypeScript
    "js",
    "jsx",
    "mjs",
    "cjs",
    "ts",
    "tsx",
    "mts",
    "cts",
    // Go
    "go",
    // Java / Kotlin
    "java",
    "kt",
    "kts",
    // C / C++
    "c",
    "h",
    "cc",
    "cpp",
    "cxx",
    "hpp",
    "hxx",
    // C#
    "cs",
    // Ruby
    "rb",
    // PHP
    "php",
    // Swift
    "swift",
    // Scala
    "scala",
    // Shell
    "sh",
    "bash",
    "zsh",
    "fish",
    // Web markup and stylesheets
    "html",
    "htm",
    "css",
    "scss",
    "sass",
    "less",
    // Configuration and data
    "json",
    "yaml",
    "yml",
    "toml",
    "xml",
    "ini",
    "cfg",
    // Documentation and plain text
    "md",
    "rst",
    "txt",
    // SQL
    "sql",
    // Lua
    "lua",
    // Dart
    "dart",
    // Elixir / Erlang
    "ex",
    "exs",
    "erl",
    // Haskell
    "hs",
    // OCaml
    "ml",
    "mli",
    // Zig
    "zig",
    // Protocol Buffers
    "proto",
    // Build files that use their tool name as an extension
    "dockerfile",
    "makefile",
];
67
68pub fn is_binary_file(path: &Path) -> bool {
73 match std::fs::read(path) {
74 Ok(content) => is_binary_content(&content),
75 Err(_) => true, }
77}
78
79pub fn is_binary_content(content: &[u8]) -> bool {
83 let check_len = content.len().min(BINARY_CHECK_SIZE);
84 if check_len == 0 {
85 return false; }
87
88 let slice = &content[..check_len];
89 let null_count = slice.iter().filter(|&&b| b == 0).count();
90 let null_ratio = null_count as f64 / check_len as f64;
91
92 null_ratio > BINARY_NULL_THRESHOLD
93}
94
95pub fn is_source_file(path: &Path) -> bool {
97 if let Some(filename) = path.file_name().and_then(|f| f.to_str()) {
99 let lower = filename.to_lowercase();
100 if matches!(
101 lower.as_str(),
102 "makefile" | "dockerfile" | "rakefile" | "gemfile" | "cmakelists.txt"
103 ) {
104 return true;
105 }
106 }
107
108 path.extension()
110 .and_then(|ext| ext.to_str())
111 .map(|ext| {
112 let lower = ext.to_lowercase();
113 SOURCE_EXTENSIONS.contains(&lower.as_str())
114 })
115 .unwrap_or(false)
116}
117
118pub fn should_index_file(path: &Path, size: u64, max_file_size: u64) -> bool {
125 if size > max_file_size {
127 tracing::debug!(path = %path.display(), size, max = max_file_size, "Skipping oversized file");
128 return false;
129 }
130
131 if !is_source_file(path) {
133 return false;
134 }
135
136 true
137}
138
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain text, NUL-heavy data and empty input are classified as expected.
    #[test]
    fn test_binary_detection() {
        let text: &[u8] = b"Hello, world!\nThis is text.";
        assert!(!is_binary_content(text));

        // A short header followed by NUL padding.
        let mut binary = vec![0u8; 1000];
        binary[..3].copy_from_slice(b"ELF");
        assert!(is_binary_content(&binary));

        // Empty content is not binary.
        assert!(!is_binary_content(b""));
    }

    /// Known source names are accepted; other names are rejected.
    #[test]
    fn test_source_file_detection() {
        let accepted = [
            "main.rs",
            "app.py",
            "index.ts",
            "main.go",
            "config.toml",
            "Makefile",
        ];
        for name in accepted.iter() {
            assert!(is_source_file(Path::new(name)));
        }

        let rejected = ["image.png", "data.bin", "archive.tar.gz"];
        for name in rejected.iter() {
            assert!(!is_source_file(Path::new(name)));
        }
    }

    /// Indexing requires both an acceptable size and a source-file path.
    #[test]
    fn test_should_index_file() {
        let max_size = 10 * 1024 * 1024; // 10 MiB

        let source = Path::new("main.rs");
        assert!(should_index_file(source, 1000, max_size));
        assert!(!should_index_file(source, max_size + 1, max_size));
        assert!(!should_index_file(Path::new("image.png"), 1000, max_size));
    }
}