use std::path::Path;
/// Upper bound on how many leading bytes of a buffer `is_binary_content`
/// inspects; anything past this prefix does not influence the verdict.
const BINARY_CHECK_SIZE: usize = 8192;
/// Fraction of NUL (`0x00`) bytes in the inspected prefix that must be
/// strictly exceeded for `is_binary_content` to report the data as binary.
const BINARY_NULL_THRESHOLD: f64 = 0.01;
/// File extensions (lowercase, without the leading dot) that `is_source_file`
/// accepts. The path's extension is lowercased before the lookup, so every
/// entry here must be lowercase to be reachable.
const SOURCE_EXTENSIONS: &[&str] = &[
"rs",
"py", "pyi", "pyx",
"js", "jsx", "mjs", "cjs", "ts", "tsx", "mts", "cts",
"go",
"java", "kt", "kts",
"c", "h", "cc", "cpp", "cxx", "hpp", "hxx",
"cs",
"rb",
"php",
"swift",
"scala",
"sh", "bash", "zsh", "fish",
"html", "htm", "css", "scss", "sass", "less",
"json", "yaml", "yml", "toml", "xml", "ini", "cfg",
"md", "rst", "txt",
"sql",
"lua",
"dart",
"ex", "exs", "erl",
"hs",
"ml", "mli",
"zig",
"proto",
// The two entries below only match when used as an *extension*
// (e.g. `prod.dockerfile`); extension-less names such as `Dockerfile` or
// `Makefile` are recognised by the file-name check in `is_source_file`.
"dockerfile",
"makefile",
];
/// Reports whether the file at `path` looks like binary data.
///
/// Only the first [`BINARY_CHECK_SIZE`] bytes are read from disk, because
/// that prefix is all [`is_binary_content`] ever looks at. This keeps memory
/// use and I/O bounded no matter how large the file is.
///
/// A file that cannot be opened or read is reported as binary (`true`), so
/// callers that skip binary files also skip unreadable ones.
pub fn is_binary_file(path: &Path) -> bool {
    let file = match std::fs::File::open(path) {
        Ok(file) => file,
        Err(_) => return true,
    };

    // Cap the reader at the inspection window; `read_to_end` then stops at
    // whichever comes first: end of file or the cap.
    let mut limited = std::io::Read::take(file, BINARY_CHECK_SIZE as u64);
    let mut prefix = Vec::with_capacity(BINARY_CHECK_SIZE);
    match std::io::Read::read_to_end(&mut limited, &mut prefix) {
        Ok(_) => is_binary_content(&prefix),
        Err(_) => true,
    }
}

/// Heuristically classifies a byte buffer as binary or text.
///
/// At most the first [`BINARY_CHECK_SIZE`] bytes of `content` are examined.
/// The buffer counts as binary when the share of NUL bytes in that prefix is
/// strictly greater than [`BINARY_NULL_THRESHOLD`]. An empty buffer is never
/// binary.
pub fn is_binary_content(content: &[u8]) -> bool {
    let window = &content[..content.len().min(BINARY_CHECK_SIZE)];
    if window.is_empty() {
        // Avoids a 0/0 division below; empty input is treated as text.
        return false;
    }

    let nul_bytes = window.iter().filter(|&&byte| byte == 0).count();
    (nul_bytes as f64 / window.len() as f64) > BINARY_NULL_THRESHOLD
}
/// Decides whether `path` names a file that should be treated as source/text.
///
/// Two checks are applied, both case-insensitively:
/// 1. the complete file name against a short list of extension-less build
///    files (`Makefile`, `Dockerfile`, ...);
/// 2. the file extension against [`SOURCE_EXTENSIONS`].
///
/// Names or extensions that are not valid UTF-8 never match.
pub fn is_source_file(path: &Path) -> bool {
    const WELL_KNOWN_NAMES: [&str; 5] = [
        "makefile",
        "dockerfile",
        "rakefile",
        "gemfile",
        "cmakelists.txt",
    ];

    let name_is_known = path
        .file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| {
            WELL_KNOWN_NAMES.contains(&name.to_lowercase().as_str())
        });
    if name_is_known {
        return true;
    }

    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => SOURCE_EXTENSIONS.contains(&ext.to_lowercase().as_str()),
        None => false,
    }
}
/// Decides whether a file is eligible for indexing.
///
/// A file qualifies when its `size` does not exceed `max_file_size` and
/// [`is_source_file`] accepts its `path`. The size limit is checked first;
/// an oversized file is reported through a debug-level log event and
/// rejected without consulting the name/extension rules.
pub fn should_index_file(path: &Path, size: u64, max_file_size: u64) -> bool {
    let within_limit = size <= max_file_size;
    if !within_limit {
        tracing::debug!(path = %path.display(), size, max = max_file_size, "Skipping oversized file");
    }
    // `&&` short-circuits, so the source check only runs for files that fit.
    within_limit && is_source_file(path)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Text, NUL-heavy, and empty buffers are classified as expected.
    #[test]
    fn test_binary_detection() {
        let text: &[u8] = b"Hello, world!\nThis is text.";
        assert!(!is_binary_content(text));

        // 1000 bytes: an "ELF" marker followed by nothing but NULs.
        let mut elf_like = b"ELF".to_vec();
        elf_like.resize(1000, 0);
        assert!(is_binary_content(&elf_like));

        let empty: &[u8] = &[];
        assert!(!is_binary_content(empty));
    }

    /// Known source names are accepted; media/archive names are rejected.
    #[test]
    fn test_source_file_detection() {
        let accepted = [
            "main.rs",
            "app.py",
            "index.ts",
            "main.go",
            "config.toml",
            "Makefile",
        ];
        for name in accepted.iter() {
            assert!(is_source_file(Path::new(name)), "{} should be a source file", name);
        }

        let rejected = ["image.png", "data.bin", "archive.tar.gz"];
        for name in rejected.iter() {
            assert!(!is_source_file(Path::new(name)), "{} should not be a source file", name);
        }
    }

    /// Indexing requires both an acceptable size and a source-file name.
    #[test]
    fn test_should_index_file() {
        const MAX_SIZE: u64 = 10 * 1024 * 1024;
        let source = Path::new("main.rs");
        let image = Path::new("image.png");

        assert!(should_index_file(source, 1000, MAX_SIZE));
        assert!(!should_index_file(source, MAX_SIZE + 1, MAX_SIZE));
        assert!(!should_index_file(image, 1000, MAX_SIZE));
    }
}