Skip to main content

lean_ctx/core/
binary_detect.rs

1use std::path::Path;
2
/// File extensions (lowercase, without the leading dot) that are treated as
/// binary purely by name, without opening the file.
///
/// Entries must stay lowercase: `has_binary_extension` lowercases the path's
/// extension before looking it up here. Each extension is listed exactly once,
/// under the first category it belongs to.
const BINARY_EXTENSIONS: &[&str] = &[
    // Data formats
    "parquet",
    "avro",
    "orc",
    "arrow",
    "feather",
    "hdf5",
    "h5",
    "npy",
    "npz",
    // Databases
    "db",
    "sqlite",
    "sqlite3",
    "mdb",
    "accdb",
    "ldb",
    // Archives
    "zip",
    "gz",
    "tar",
    "bz2",
    "xz",
    "7z",
    "rar",
    "zst",
    "lz4",
    "lzma",
    // Images
    "png",
    "jpg",
    "jpeg",
    "gif",
    "webp",
    "bmp",
    "ico",
    "tiff",
    "tif",
    // NOTE(review): SVG is XML text; presumably listed on purpose — confirm.
    "svg",
    "psd",
    "raw",
    "cr2",
    "nef",
    "heic",
    "heif",
    "avif",
    // Audio/Video
    "mp3",
    "mp4",
    "wav",
    "flac",
    "ogg",
    "avi",
    "mkv",
    "mov",
    "webm",
    "m4a",
    // Executables/Libraries
    "exe",
    "dll",
    "so",
    "dylib",
    "o",
    "a",
    "obj",
    "lib",
    "pdb",
    "class",
    "jar",
    "war",
    "ear",
    // Compiled/Bytecode
    "pyc",
    "pyo",
    "whl",
    "egg",
    "beam",
    "wasm",
    // NOTE(review): `wast` is a WebAssembly *text* format — confirm it should be skipped.
    "wast",
    // ML models ("h5" weight files are already covered under data formats)
    "model",
    "onnx",
    "pt",
    "pth",
    "safetensors",
    "gguf",
    "ggml",
    "tflite",
    "pb",
    "keras",
    // Serialized
    "pkl",
    "pickle",
    "bin",
    "dat",
    "protobuf",
    // Documents (binary)
    "pdf",
    "doc",
    "docx",
    "xls",
    "xlsx",
    "ppt",
    "pptx",
    "odt",
    "ods",
    // Fonts
    "ttf",
    "otf",
    "woff",
    "woff2",
    "eot",
    // Disk images
    "iso",
    "img",
    "vmdk",
    "qcow2",
];
123
124/// Fast extension-based binary detection (zero I/O).
125fn has_binary_extension(path: &str) -> bool {
126    Path::new(path)
127        .extension()
128        .and_then(|e| e.to_str())
129        .map(str::to_ascii_lowercase)
130        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext.as_str()))
131}
132
/// Heuristic content sniff: inspects up to the first 8 KiB of the file and
/// reports whether that window contains a NUL byte.
///
/// This is the common NUL-byte heuristic used by tools such as git.
/// Returns `false` when the file cannot be opened or read, so an unreadable
/// path is never classified as binary by this check.
fn has_binary_content(path: &str) -> bool {
    use std::io::Read;

    let Ok(mut file) = std::fs::File::open(path) else {
        return false;
    };
    let mut buf = [0u8; 8192];
    let mut filled = 0;
    // A single `read` may legally return fewer bytes than requested, so keep
    // reading until the window is full or EOF is reached; otherwise a short
    // read would silently shrink the inspected prefix.
    while filled < buf.len() {
        match file.read(&mut buf[filled..]) {
            Ok(0) => break,
            Ok(n) => filled += n,
            // Interrupted reads are transient and safe to retry.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
            Err(_) => return false,
        }
    }
    buf[..filled].contains(&0)
}
147
148/// Returns `true` if the file is likely a binary file.
149/// Checks extension first (zero I/O), falls back to content inspection.
150pub fn is_binary_file(path: &str) -> bool {
151    if has_binary_extension(path) {
152        return true;
153    }
154    has_binary_content(path)
155}
156
/// Maps the extension of `path` to a short human-readable category label.
///
/// Matching is ASCII case-insensitive. Extensions without a dedicated label,
/// and paths without any extension, fall back to the generic `"binary file"`.
fn file_type_label(path: &str) -> &'static str {
    let ext = Path::new(path)
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("");
    match ext.to_ascii_lowercase().as_str() {
        "parquet" | "avro" | "orc" | "arrow" | "feather" => "columnar data file",
        // "h5" is matched here first, so HDF5-based weight files get this label.
        "hdf5" | "h5" | "npy" | "npz" => "scientific data file",
        "db" | "sqlite" | "sqlite3" | "mdb" | "accdb" | "ldb" => "database file",
        "zip" | "gz" | "tar" | "bz2" | "xz" | "7z" | "rar" | "zst" | "lz4" | "lzma" => {
            "compressed archive"
        }
        "png" | "jpg" | "jpeg" | "gif" | "webp" | "bmp" | "ico" | "tiff" | "tif" | "svg"
        | "psd" | "raw" | "cr2" | "nef" | "heic" | "heif" | "avif" => "image file",
        "mp3" | "mp4" | "wav" | "flac" | "ogg" | "avi" | "mkv" | "mov" | "webm" | "m4a" => {
            "media file"
        }
        "exe" | "dll" | "so" | "dylib" => "native binary",
        "wasm" => "WebAssembly binary",
        "pdf" => "PDF document",
        "onnx" | "pt" | "pth" | "safetensors" | "gguf" | "ggml" | "tflite" | "keras" => {
            "ML model file"
        }
        "pkl" | "pickle" => "serialized object",
        "pyc" | "pyo" => "Python bytecode",
        "class" | "jar" | "war" | "ear" => "Java bytecode",
        _ => "binary file",
    }
}
180
181/// Returns a helpful error message for binary files, including file type and suggestions.
182pub fn binary_file_message(path: &str) -> String {
183    let ext = Path::new(path)
184        .extension()
185        .and_then(|e| e.to_str())
186        .unwrap_or("unknown");
187    let label = file_type_label(path);
188    format!(
189        "Binary file detected (.{ext}, {label}). \
190         lean-ctx cannot read binary files as text. \
191         Use a specialized tool for this file type."
192    )
193}
194
#[cfg(test)]
mod tests {
    use super::*;

    /// Known binary extensions are recognised regardless of case or directory prefix.
    #[test]
    fn detects_binary_extensions() {
        assert!(has_binary_extension("data.parquet"));
        assert!(has_binary_extension("model.onnx"));
        assert!(has_binary_extension("archive.tar.gz"));
        assert!(has_binary_extension("photo.PNG"));
        assert!(has_binary_extension("/path/to/file.sqlite3"));
    }

    /// Common source/text extensions must not be flagged by the extension check.
    #[test]
    fn rejects_text_extensions() {
        assert!(!has_binary_extension("main.rs"));
        assert!(!has_binary_extension("config.toml"));
        assert!(!has_binary_extension("README.md"));
        assert!(!has_binary_extension("script.py"));
    }

    /// The message carries both the extension and its category label.
    #[test]
    fn message_includes_type() {
        let msg = binary_file_message("data.parquet");
        assert!(msg.contains("columnar data file"));
        assert!(msg.contains(".parquet"));
    }

    /// Unrecognised extensions fall back to the generic label.
    #[test]
    fn message_for_unknown_binary() {
        let msg = binary_file_message("file.xyz");
        assert!(msg.contains("binary file"));
    }

    /// Content sniffing flags a file containing NUL bytes and accepts plain text.
    #[test]
    fn null_byte_detection() {
        // Per-process directory name so concurrent test runs on the same machine
        // cannot delete or overwrite each other's fixtures.
        let dir_name = format!("lean_ctx_binary_test_{}", std::process::id());
        let dir = std::env::temp_dir().join(dir_name);
        std::fs::create_dir_all(&dir).expect("temp fixture directory should be creatable");

        let bin_path = dir.join("test.bin");
        std::fs::write(&bin_path, b"\x00\x01\x02\x03").unwrap();
        assert!(has_binary_content(bin_path.to_str().unwrap()));

        let txt_path = dir.join("test.txt");
        std::fs::write(&txt_path, b"hello world").unwrap();
        assert!(!has_binary_content(txt_path.to_str().unwrap()));

        // Best-effort cleanup; a leftover directory does not affect later runs.
        std::fs::remove_dir_all(&dir).ok();
    }
}
244}