Skip to main content

sqry_core/io/
binary.rs

1//! Binary file detection utilities
2//!
3//! Provides heuristics to determine whether a file (or byte slice) should be
4//! considered binary. The implementation favors determinism and avoids false
5//! positives by combining null-byte checks with a ratio of printable characters.
6
7use std::fs::File;
8use std::io::{Read, Result as IoResult};
9use std::path::Path;
10
11use crate::config::buffers::DEFAULT_READ_BUFFER;
12
/// Number of bytes sampled when inspecting a file.
///
/// Aliases `DEFAULT_READ_BUFFER` so the sample tracks the default read buffer size
/// (documented here as 8 KiB; the actual value is defined in `crate::config::buffers`).
const SAMPLE_SIZE: usize = DEFAULT_READ_BUFFER;
16
/// Percentage of non-printable bytes above which a sample is considered binary.
///
/// "Non-printable" means any byte rejected by `is_printable_ascii`, which accepts common
/// ASCII whitespace. The comparison in `is_binary_bytes` is strict (`>`), so a share of
/// exactly 30% is still treated as text.
const NON_PRINTABLE_THRESHOLD_PERCENT: usize = 30;
20
21/// Determine if the provided byte slice should be treated as binary.
22///
23/// The heuristic mirrors common implementations used by tools such as ripgrep
24/// and git. A slice is considered binary if:
25/// - It contains a NUL byte (`0x00`), or
26/// - More than 30% of bytes are outside the printable ASCII range.
27#[must_use]
28pub fn is_binary_bytes(data: &[u8]) -> bool {
29    if data.is_empty() {
30        return false;
31    }
32
33    if data.contains(&0) {
34        return true;
35    }
36
37    let non_printable = data
38        .iter()
39        .filter(|&&byte| !is_printable_ascii(byte))
40        .count();
41
42    non_printable.saturating_mul(100) > data.len().saturating_mul(NON_PRINTABLE_THRESHOLD_PERCENT)
43}
44
45/// Detect whether a file at `path` is likely binary by sampling the first 8 KiB.
46///
47/// # Errors
48///
49/// Returns [`std::io::Error`] when the file cannot be opened or read.
50pub fn is_binary_file(path: &Path) -> IoResult<bool> {
51    let mut file = File::open(path)?;
52    let mut buffer = vec![0u8; SAMPLE_SIZE];
53    let read = file.read(&mut buffer)?;
54    Ok(is_binary_bytes(&buffer[..read]))
55}
56
/// Report whether `byte` counts as text for the binary heuristic.
///
/// Accepted bytes are the ASCII whitespace controls `0x09..=0x0D` (tab, line
/// feed, vertical tab, form feed, carriage return) and the printable range
/// `0x20..=0x7E` (space through `~`). Every other value, including all bytes
/// `>= 0x80`, is rejected.
fn is_printable_ascii(byte: u8) -> bool {
    let is_whitespace_control = (0x09..=0x0D).contains(&byte);
    let is_visible_or_space = (0x20..=0x7E).contains(&byte);
    is_whitespace_control || is_visible_or_space
}
60
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Create a temporary file holding exactly `contents`, flushed before use.
    ///
    /// The returned handle must stay alive for as long as the path is needed.
    fn temp_file_with(contents: &[u8]) -> NamedTempFile {
        let mut fixture = NamedTempFile::new().unwrap();
        fixture.write_all(contents).unwrap();
        fixture.flush().unwrap();
        fixture
    }

    // --- byte-slice heuristic ---

    #[test]
    fn detects_ascii_text_as_non_binary() {
        assert!(!is_binary_bytes(b"hello world\nthis is text"));
    }

    #[test]
    fn detects_null_byte_as_binary() {
        assert!(is_binary_bytes(b"hello\0world"));
    }

    #[test]
    fn detects_high_ratio_non_printable_as_binary() {
        // 100 control bytes and no NUL: only the ratio rule can fire here.
        let noise = [0x01u8; 100];
        assert!(is_binary_bytes(&noise));
    }

    // --- file-based detection ---

    #[test]
    fn file_detection_respects_null_bytes() {
        let fixture = temp_file_with(b"text before\0text after");
        let verdict = is_binary_file(fixture.path()).unwrap();
        assert!(verdict);
    }

    #[test]
    fn file_detection_handles_text() {
        let fixture = temp_file_with(b"plain ascii text\n");
        let verdict = is_binary_file(fixture.path()).unwrap();
        assert!(!verdict);
    }
}