Skip to main content

codesearch/file/
binary.rs

1use std::fs::File;
2use std::io::Read;
3use std::path::Path;
4
5/// Check if a file is binary using multiple heuristics
6///
7/// This function uses several techniques to detect binary files:
8/// 1. File extension (known binary extensions)
9/// 2. Null byte detection (most reliable for true binary files)
10/// 3. Non-printable character ratio (for text files with some binary data)
11/// 4. UTF-8 validity (text files should be valid UTF-8)
12pub fn is_binary_file(path: &Path) -> bool {
13    // First check: known binary extensions
14    if is_binary_by_extension(path) {
15        return true;
16    }
17
18    // Second check: read file content and analyze
19    is_binary_by_content(path)
20}
21
/// Check if a file should be treated as binary based on its name alone.
///
/// Matching is case-insensitive. Two kinds of names are recognised:
///
/// * The compound suffixes `.min.js` and `.bundle.js` (minified / bundled
///   JavaScript, excluded because it is not indexable). These are matched
///   against the whole file name, because [`Path::extension`] only yields the
///   part after the final dot (`"js"`) and so can never equal `"min.js"`.
/// * Single extensions of known binary or non-indexable formats.
///
/// Returns `false` when the path has no file name or extension, or when the
/// relevant component is not valid UTF-8.
fn is_binary_by_extension(path: &Path) -> bool {
    // Suffixes that span more than one dot-separated component.
    const COMPOUND_SUFFIXES: [&str; 2] = [".min.js", ".bundle.js"];

    if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
        let name = name.to_lowercase();
        if COMPOUND_SUFFIXES.iter().any(|suffix| name.ends_with(*suffix)) {
            return true;
        }
    }

    match path.extension().and_then(|e| e.to_str()) {
        Some(ext) => matches!(
            ext.to_lowercase().as_str(),
            // Executables and libraries
            "exe" | "dll" | "so" | "dylib" | "a" | "o" | "lib" | "bin"
            // Archives
            | "zip" | "tar" | "gz" | "bz2" | "xz" | "7z" | "rar" | "tgz"
            // Images
            | "png" | "jpg" | "jpeg" | "gif" | "bmp" | "ico" | "svg" | "webp"
            // Videos
            | "mp4" | "avi" | "mov" | "wmv" | "flv" | "mkv" | "webm"
            // Audio
            | "mp3" | "wav" | "ogg" | "flac" | "aac" | "wma"
            // Documents (binary formats)
            | "pdf" | "doc" | "docx" | "xls" | "xlsx" | "ppt" | "pptx"
            // Other binary formats
            | "wasm" | "pyc" | "class" | "jar" | "war"
            // Lock files (not indexable)
            | "lock"
        ),
        None => false,
    }
}
48
/// Check if file content appears to be binary.
///
/// Up to the first 8 KiB of the file are sampled and examined in this order:
///
/// 1. An empty sample is treated as text.
/// 2. Any NUL byte marks the file as binary.
/// 3. If more than 30% of the sampled bytes are neither printable ASCII nor
///    tab/LF/CR, the file is binary when the sample is not valid UTF-8, or
///    when that share exceeds 80% even though the sample is valid UTF-8.
///
/// A multi-byte UTF-8 character cut in half by the end of a *full* sample is
/// not counted as invalid UTF-8: the cut is an artefact of sampling, not a
/// property of the file.
///
/// Returns `false` (text) when the file cannot be opened or read.
fn is_binary_by_content(path: &Path) -> bool {
    // Maximum number of leading bytes inspected.
    const SAMPLE_SIZE: usize = 8192;

    let file = match File::open(path) {
        Ok(file) => file,
        Err(_) => return false,
    };

    // A single `read` call may return fewer bytes than are available;
    // `take` + `read_to_end` keeps reading until EOF or the sample limit.
    let mut sample = Vec::with_capacity(SAMPLE_SIZE);
    // usize -> u64 is a lossless widening here.
    if file
        .take(SAMPLE_SIZE as u64)
        .read_to_end(&mut sample)
        .is_err()
    {
        return false;
    }

    // Empty file is not binary.
    if sample.is_empty() {
        return false;
    }

    // NUL bytes are a strong indicator of binary content.
    if sample.contains(&0) {
        return true;
    }

    // Share of bytes that are control characters or non-ASCII.
    let non_printable_count = sample
        .iter()
        .filter(|&&byte| !is_printable_or_whitespace(byte))
        .count();
    let non_printable_ratio = non_printable_count as f64 / sample.len() as f64;

    if non_printable_ratio <= 0.30 {
        return false;
    }

    let is_utf8 = match std::str::from_utf8(&sample) {
        Ok(_) => true,
        // `error_len() == None` means the input ended in the middle of a
        // character. That is expected when the sample filled up and the file
        // may continue past it, so only then is it tolerated.
        Err(err) => sample.len() == SAMPLE_SIZE && err.error_len().is_none(),
    };

    // Invalid UTF-8 with many odd bytes is binary. Valid UTF-8 is still
    // considered binary above 80%.
    // NOTE(review): every non-ASCII byte counts as non-printable, so text made
    // up almost entirely of multi-byte characters (e.g. CJK) crosses the 80%
    // threshold and is reported as binary.
    !is_utf8 || non_printable_ratio > 0.80
}

/// Check if a byte is printable ASCII or common whitespace.
///
/// Accepts `0x20` (space) through `0x7E` (`~`) plus tab (`0x09`), line feed
/// (`0x0A`) and carriage return (`0x0D`). Every other byte, including all
/// bytes `>= 0x7F`, yields `false`.
#[inline]
fn is_printable_or_whitespace(byte: u8) -> bool {
    matches!(byte, 0x09 | 0x0A | 0x0D | 0x20..=0x7E)
}
109
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::io::Write;
    use tempfile::TempDir;

    /// Known binary extensions are detected; source and markup files are not.
    #[test]
    fn test_binary_by_extension() {
        assert!(is_binary_by_extension(Path::new("test.exe")));
        assert!(is_binary_by_extension(Path::new("libfoo.so")));
        assert!(is_binary_by_extension(Path::new("image.png")));
        assert!(is_binary_by_extension(Path::new("archive.zip")));
        assert!(is_binary_by_extension(Path::new("video.mp4")));
        assert!(!is_binary_by_extension(Path::new("main.rs")));
        assert!(!is_binary_by_extension(Path::new("README.md")));
    }

    /// The extension comparison ignores letter case.
    #[test]
    fn test_binary_by_extension_is_case_insensitive() {
        assert!(is_binary_by_extension(Path::new("IMAGE.PNG")));
        assert!(is_binary_by_extension(Path::new("Archive.Zip")));
    }

    /// A name without any extension is never binary by extension.
    #[test]
    fn test_no_extension_is_not_binary_by_extension() {
        assert!(!is_binary_by_extension(Path::new("Makefile")));
    }

    /// Ordinary multi-line ASCII text is classified as text.
    #[test]
    fn test_text_file_detection() {
        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("test.txt");
        let mut file = File::create(&file_path).unwrap();
        writeln!(file, "This is a text file").unwrap();
        writeln!(file, "with multiple lines").unwrap();
        drop(file);

        assert!(!is_binary_by_content(&file_path));
    }

    /// Content containing a null byte is classified as binary.
    #[test]
    fn test_binary_file_detection() {
        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("test.bin");
        let mut file = File::create(&file_path).unwrap();
        // Write binary data with null bytes
        file.write_all(&[0x00, 0x01, 0x02, 0x03, 0xFF]).unwrap();
        drop(file);

        assert!(is_binary_by_content(&file_path));
    }

    /// Content made of control characters (no nulls) is classified as binary.
    #[test]
    fn test_non_printable_ratio() {
        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("test.dat");
        let mut file = File::create(&file_path).unwrap();
        // Write mostly non-printable characters (but no nulls)
        let data: Vec<u8> = (0x01..=0x08).cycle().take(1000).collect();
        file.write_all(&data).unwrap();
        drop(file);

        assert!(is_binary_by_content(&file_path));
    }

    /// Valid UTF-8 with some non-ASCII text is text; invalid UTF-8 is binary.
    #[test]
    fn test_utf8_validity() {
        let dir = TempDir::new().unwrap();

        // Valid UTF-8: "Hello, " followed by two CJK characters (U+4E16,
        // U+754C) and "!". Written with escapes so the literal survives
        // re-encoding of this source file.
        let valid_path = dir.path().join("valid.txt");
        fs::write(&valid_path, "Hello, \u{4e16}\u{754c}!").unwrap();
        assert!(!is_binary_by_content(&valid_path));

        // Invalid UTF-8
        let invalid_path = dir.path().join("invalid.txt");
        fs::write(&invalid_path, [0xFF, 0xFE, 0xFD]).unwrap();
        assert!(is_binary_by_content(&invalid_path));
    }

    /// A zero-length file is classified as text.
    #[test]
    fn test_empty_file_is_not_binary() {
        let dir = TempDir::new().unwrap();
        let empty_path = dir.path().join("empty.txt");
        fs::write(&empty_path, b"").unwrap();

        assert!(!is_binary_by_content(&empty_path));
    }

    /// A path that cannot be opened is classified as text rather than failing.
    #[test]
    fn test_missing_file_is_not_binary() {
        let dir = TempDir::new().unwrap();
        let missing_path = dir.path().join("does_not_exist.txt");

        assert!(!is_binary_by_content(&missing_path));
    }

    /// `is_binary_file` reports binary if either the name or the content says so.
    #[test]
    fn test_is_binary_file_combines_checks() {
        let dir = TempDir::new().unwrap();

        // Text content, binary extension: the extension decides.
        let by_name = dir.path().join("notes.png");
        fs::write(&by_name, "plain text").unwrap();
        assert!(is_binary_file(&by_name));

        // Unknown extension, binary content: the content decides.
        let by_content = dir.path().join("blob.dat");
        fs::write(&by_content, [0x00, 0x01, 0x02]).unwrap();
        assert!(is_binary_file(&by_content));

        // Text extension and text content.
        let text = dir.path().join("notes.txt");
        fs::write(&text, "plain text").unwrap();
        assert!(!is_binary_file(&text));
    }

    /// Printable ASCII and tab/LF/CR are accepted; control and non-ASCII bytes are not.
    #[test]
    fn test_printable_or_whitespace() {
        assert!(is_printable_or_whitespace(b' ')); // space
        assert!(is_printable_or_whitespace(b'\t')); // tab
        assert!(is_printable_or_whitespace(b'\n')); // newline
        assert!(is_printable_or_whitespace(b'\r')); // carriage return
        assert!(is_printable_or_whitespace(b'A'));
        assert!(is_printable_or_whitespace(b'z'));
        assert!(is_printable_or_whitespace(b'0'));
        assert!(!is_printable_or_whitespace(0x00)); // null
        assert!(!is_printable_or_whitespace(0x01)); // control char
        assert!(!is_printable_or_whitespace(0xFF)); // non-ASCII
    }
}