1use std::path::Path;
2
/// File extensions (lowercase ASCII, without the leading dot) that are treated
/// as binary without inspecting the file's contents.
///
/// Lookups compare against the ASCII-lowercased final extension of a path, so
/// every entry must be lowercase and should appear exactly once.
const BINARY_EXTENSIONS: &[&str] = &[
    // Columnar / scientific data formats
    "parquet",
    "avro",
    "orc",
    "arrow",
    "feather",
    "hdf5",
    "h5",
    "npy",
    "npz",
    // Databases
    "db",
    "sqlite",
    "sqlite3",
    "mdb",
    "accdb",
    "ldb",
    // Archives and compressed streams
    "zip",
    "gz",
    "tar",
    "bz2",
    "xz",
    "7z",
    "rar",
    "zst",
    "lz4",
    "lzma",
    // Images
    "png",
    "jpg",
    "jpeg",
    "gif",
    "webp",
    "bmp",
    "ico",
    "tiff",
    "tif",
    "svg",
    "psd",
    "raw",
    "cr2",
    "nef",
    "heic",
    "heif",
    "avif",
    // Audio and video
    "mp3",
    "mp4",
    "wav",
    "flac",
    "ogg",
    "avi",
    "mkv",
    "mov",
    "webm",
    "m4a",
    // Executables, libraries, object files and JVM artifacts
    "exe",
    "dll",
    "so",
    "dylib",
    "o",
    "a",
    "obj",
    "lib",
    "pdb",
    "class",
    "jar",
    "war",
    "ear",
    // Bytecode and packaged build outputs
    "pyc",
    "pyo",
    "whl",
    "egg",
    "beam",
    "wasm",
    "wast",
    // Machine-learning models ("h5" is already listed with the data formats)
    "model",
    "onnx",
    "pt",
    "pth",
    "safetensors",
    "gguf",
    "ggml",
    "tflite",
    "pb",
    "keras",
    // Serialized objects and opaque blobs
    "pkl",
    "pickle",
    "bin",
    "dat",
    "protobuf",
    // Office and document formats
    "pdf",
    "doc",
    "docx",
    "xls",
    "xlsx",
    "ppt",
    "pptx",
    "odt",
    "ods",
    // Fonts
    "ttf",
    "otf",
    "woff",
    "woff2",
    "eot",
    // Disk images
    "iso",
    "img",
    "vmdk",
    "qcow2",
];
123
124fn has_binary_extension(path: &str) -> bool {
126 Path::new(path)
127 .extension()
128 .and_then(|e| e.to_str())
129 .map(str::to_ascii_lowercase)
130 .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext.as_str()))
131}
132
/// Reports whether the first 8 KiB of the file at `path` contain a NUL byte.
///
/// Returns `false` when the file cannot be opened, when reading fails before a
/// NUL byte has been seen, or when no NUL byte occurs in the inspected prefix
/// (including for an empty file).
fn has_binary_content(path: &str) -> bool {
    use std::io::{ErrorKind, Read};

    /// Number of leading bytes inspected for NUL.
    const SNIFF_LEN: usize = 8192;

    let Ok(mut file) = std::fs::File::open(path) else {
        return false;
    };

    let mut chunk = [0u8; SNIFF_LEN];
    let mut remaining = SNIFF_LEN;
    // A single `read` may legally return fewer bytes than requested, so keep
    // reading until the window is exhausted or end-of-file is reached.
    while remaining > 0 {
        match file.read(&mut chunk[..remaining]) {
            Ok(0) => break,
            Ok(n) => {
                if chunk[..n].contains(&0) {
                    return true;
                }
                remaining -= n;
            }
            // An interrupted read is transient; retry instead of giving up.
            Err(e) if e.kind() == ErrorKind::Interrupted => {}
            Err(_) => return false,
        }
    }
    false
}
147
148pub fn is_binary_file(path: &str) -> bool {
151 if has_binary_extension(path) {
152 return true;
153 }
154 has_binary_content(path)
155}
156
/// Maps the extension of `path` (ASCII-case-insensitively) to a short,
/// human-readable description of the file type.
///
/// Extensions without a dedicated arm, and paths with no extension or a
/// non-UTF-8 extension, get the generic `"binary file"` label.
fn file_type_label(path: &str) -> &'static str {
    let ext = Path::new(path)
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("");
    // Each arm covers the matching group of extensions from
    // `BINARY_EXTENSIONS`, so related formats (e.g. `heic`/`heif`) share a label.
    match ext.to_ascii_lowercase().as_str() {
        "parquet" | "avro" | "orc" | "arrow" | "feather" => "columnar data file",
        "hdf5" | "h5" | "npy" | "npz" => "scientific data file",
        "db" | "sqlite" | "sqlite3" | "mdb" | "accdb" | "ldb" => "database file",
        "zip" | "gz" | "tar" | "bz2" | "xz" | "7z" | "rar" | "zst" | "lz4" | "lzma" => {
            "compressed archive"
        }
        "png" | "jpg" | "jpeg" | "gif" | "webp" | "bmp" | "ico" | "tiff" | "tif" | "svg"
        | "psd" | "raw" | "cr2" | "nef" | "heic" | "heif" | "avif" => "image file",
        "mp3" | "mp4" | "wav" | "flac" | "ogg" | "avi" | "mkv" | "mov" | "webm" | "m4a" => {
            "media file"
        }
        "exe" | "dll" | "so" | "dylib" | "o" | "a" | "obj" | "lib" => "native binary",
        "wasm" => "WebAssembly binary",
        "pdf" => "PDF document",
        "onnx" | "pt" | "pth" | "safetensors" | "gguf" | "ggml" | "tflite" | "keras" => {
            "ML model file"
        }
        "pkl" | "pickle" => "serialized object",
        "pyc" | "pyo" => "Python bytecode",
        "class" | "jar" | "war" | "ear" => "Java bytecode",
        _ => "binary file",
    }
}
180
181pub fn binary_file_message(path: &str) -> String {
183 let ext = Path::new(path)
184 .extension()
185 .and_then(|e| e.to_str())
186 .unwrap_or("unknown");
187 let label = file_type_label(path);
188 format!(
189 "Binary file detected (.{ext}, {label}). \
190 lean-ctx cannot read binary files as text. \
191 Use a specialized tool for this file type."
192 )
193}
194
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detects_binary_extensions() {
        assert!(has_binary_extension("data.parquet"));
        assert!(has_binary_extension("model.onnx"));
        // Only the final extension is considered.
        assert!(has_binary_extension("archive.tar.gz"));
        // Matching is ASCII-case-insensitive.
        assert!(has_binary_extension("photo.PNG"));
        assert!(has_binary_extension("/path/to/file.sqlite3"));
    }

    #[test]
    fn rejects_text_extensions() {
        assert!(!has_binary_extension("main.rs"));
        assert!(!has_binary_extension("config.toml"));
        assert!(!has_binary_extension("README.md"));
        assert!(!has_binary_extension("script.py"));
    }

    #[test]
    fn message_includes_type() {
        let msg = binary_file_message("data.parquet");
        assert!(msg.contains("columnar data file"));
        assert!(msg.contains(".parquet"));
    }

    #[test]
    fn message_for_unknown_binary() {
        let msg = binary_file_message("file.xyz");
        assert!(msg.contains("binary file"));
    }

    #[test]
    fn null_byte_detection() {
        // Use a per-process directory so concurrent test runs on the same
        // machine cannot delete or overwrite each other's fixtures.
        let dir = std::env::temp_dir()
            .join(format!("lean_ctx_binary_test_{}", std::process::id()));
        std::fs::create_dir_all(&dir).expect("create temporary test directory");

        let bin_path = dir.join("test.bin");
        std::fs::write(&bin_path, b"\x00\x01\x02\x03").unwrap();
        let binary_detected =
            has_binary_content(bin_path.to_str().expect("temp path is valid UTF-8"));

        let txt_path = dir.join("test.txt");
        std::fs::write(&txt_path, b"hello world").unwrap();
        let text_detected =
            has_binary_content(txt_path.to_str().expect("temp path is valid UTF-8"));

        // Clean up before asserting so a failure does not leave files behind.
        std::fs::remove_dir_all(&dir).ok();

        assert!(binary_detected);
        assert!(!text_detected);
    }
}