codesearch/file/
binary.rs1use std::fs::File;
2use std::io::Read;
3use std::path::Path;
4
5pub fn is_binary_file(path: &Path) -> bool {
13 if is_binary_by_extension(path) {
15 return true;
16 }
17
18 is_binary_by_content(path)
20}
21
/// Classifies `path` as binary purely from its file name, without touching
/// the file system.
///
/// Returns `true` when the (case-insensitive) extension is one of the known
/// executable, archive, image, video, audio, document or compiled-artifact
/// extensions, when it is `lock`, or when the file name ends in `.min.js` /
/// `.bundle.js`. Returns `false` otherwise, including when the path has no
/// extension or its name is not valid UTF-8.
///
/// NOTE(review): `lock`, `svg` and the generated-JavaScript suffixes are text
/// formats; presumably they are listed because they are not worth searching —
/// confirm against the caller's intent.
fn is_binary_by_extension(path: &Path) -> bool {
    // `Path::extension` only yields the text after the *last* dot ("js" for
    // "app.min.js"), so compound suffixes have to be matched against the
    // whole file name instead.
    const COMPOUND_SUFFIXES: [&str; 2] = [".min.js", ".bundle.js"];

    let has_compound_suffix = path
        .file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| {
            let name = name.to_lowercase();
            COMPOUND_SUFFIXES
                .iter()
                .any(|suffix| name.ends_with(*suffix))
        });
    if has_compound_suffix {
        return true;
    }

    match path.extension().and_then(|e| e.to_str()) {
        Some(ext) => matches!(
            ext.to_lowercase().as_str(),
            // Executables, shared/static libraries, raw blobs.
            "exe" | "dll" | "so" | "dylib" | "a" | "o" | "lib" | "bin"
                // Archives.
                | "zip" | "tar" | "gz" | "bz2" | "xz" | "7z" | "rar" | "tgz"
                // Images.
                | "png" | "jpg" | "jpeg" | "gif" | "bmp" | "ico" | "svg" | "webp"
                // Video.
                | "mp4" | "avi" | "mov" | "wmv" | "flv" | "mkv" | "webm"
                // Audio.
                | "mp3" | "wav" | "ogg" | "flac" | "aac" | "wma"
                // Office documents.
                | "pdf" | "doc" | "docx" | "xls" | "xlsx" | "ppt" | "pptx"
                // Compiled artifacts.
                | "wasm" | "pyc" | "class" | "jar" | "war"
                // Lock files.
                | "lock"
        ),
        None => false,
    }
}
48
49fn is_binary_by_content(path: &Path) -> bool {
51 let mut file = match File::open(path) {
52 Ok(f) => f,
53 Err(_) => return false,
54 };
55
56 let mut buffer = [0u8; 8192];
58 let bytes_read = match file.read(&mut buffer) {
59 Ok(n) => n,
60 Err(_) => return false,
61 };
62
63 if bytes_read == 0 {
65 return false;
66 }
67
68 let data = &buffer[..bytes_read];
69
70 if data.contains(&0) {
72 return true;
73 }
74
75 let non_printable_count = data
78 .iter()
79 .filter(|&&b| !is_printable_or_whitespace(b))
80 .count();
81
82 let non_printable_ratio = non_printable_count as f64 / bytes_read as f64;
83
84 if non_printable_ratio > 0.30 {
87 if std::str::from_utf8(data).is_err() {
89 return true;
90 }
91 if non_printable_ratio > 0.80 {
94 return true;
95 }
96 }
97
98 false
100}
101
/// Returns `true` when `byte` is a space, tab, line feed, carriage return, or
/// a visible ASCII character (`0x21..=0x7E`).
///
/// Every other value — remaining control bytes, DEL, and anything `>= 0x80` —
/// yields `false`.
#[inline]
fn is_printable_or_whitespace(byte: u8) -> bool {
    let is_text_whitespace = byte == b' ' || byte == b'\t' || byte == b'\n' || byte == b'\r';
    is_text_whitespace || byte.is_ascii_graphic()
}
109
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    // Writes `contents` to `name` inside `dir` and returns the full path.
    fn write_fixture(dir: &TempDir, name: &str, contents: &[u8]) -> std::path::PathBuf {
        let target = dir.path().join(name);
        fs::write(&target, contents).unwrap();
        target
    }

    #[test]
    fn test_binary_by_extension() {
        let binary_names = ["test.exe", "libfoo.so", "image.png", "archive.zip", "video.mp4"];
        for name in binary_names.iter() {
            assert!(is_binary_by_extension(Path::new(name)));
        }

        let text_names = ["main.rs", "README.md"];
        for name in text_names.iter() {
            assert!(!is_binary_by_extension(Path::new(name)));
        }
    }

    #[test]
    fn test_text_file_detection() {
        let scratch = TempDir::new().unwrap();
        let text_path = write_fixture(
            &scratch,
            "test.txt",
            b"This is a text file\nwith multiple lines\n",
        );

        assert!(!is_binary_by_content(&text_path));
    }

    #[test]
    fn test_binary_file_detection() {
        let scratch = TempDir::new().unwrap();
        // Includes a NUL byte.
        let blob_path = write_fixture(&scratch, "test.bin", &[0x00, 0x01, 0x02, 0x03, 0xFF]);

        assert!(is_binary_by_content(&blob_path));
    }

    #[test]
    fn test_non_printable_ratio() {
        let scratch = TempDir::new().unwrap();
        // 1000 control bytes cycling through 0x01..=0x08: no NUL present, but
        // none of them is printable.
        let payload: Vec<u8> = (0..1000u32).map(|i| (i % 8) as u8 + 1).collect();
        let data_path = write_fixture(&scratch, "test.dat", &payload);

        assert!(is_binary_by_content(&data_path));
    }

    #[test]
    fn test_utf8_validity() {
        let scratch = TempDir::new().unwrap();

        // Valid UTF-8 that mixes ASCII with multi-byte characters.
        let valid_path = write_fixture(&scratch, "valid.txt", "Hello, δΈη!".as_bytes());
        assert!(!is_binary_by_content(&valid_path));

        // Bytes that can never appear in well-formed UTF-8.
        let invalid_path = write_fixture(&scratch, "invalid.txt", &[0xFF, 0xFE, 0xFD]);
        assert!(is_binary_by_content(&invalid_path));
    }

    #[test]
    fn test_printable_or_whitespace() {
        let accepted = [b' ', b'\t', b'\n', b'\r', b'A', b'z', b'0'];
        for &byte in accepted.iter() {
            assert!(is_printable_or_whitespace(byte));
        }

        let rejected = [0x00u8, 0x01, 0xFF];
        for &byte in rejected.iter() {
            assert!(!is_printable_or_whitespace(byte));
        }
    }
}