use std::io::{Error, Read};
use std::ops::Range;
/// Number of suspicious characters tolerated in the inspected sample; the data is reported as
/// binary only when the count is strictly greater than this value.
const BINARY_CHAR_THRESHOLD: i8 = 5;
/// Size, in bytes, of the buffer that is filled from the reader and inspected.
const BUFFER_CHECK_AMOUNT: usize = 255;
/// Heuristically decides whether the data behind `file` looks like binary rather than text.
///
/// At most `BUFFER_CHECK_AMOUNT` bytes are consumed from the reader, starting at its current
/// position, and decoded lossily as UTF-8. Every character that either stands in for an invalid
/// byte sequence or is flagged by `is_binary_char` is counted; the data is reported as binary
/// when that count is strictly greater than `BINARY_CHAR_THRESHOLD`.
///
/// # Errors
///
/// Returns any I/O error raised while reading from `file`.
#[allow(clippy::module_name_repetitions)]
pub fn is_file_likely_binary<R: Read>(file: &mut R) -> Result<bool, Error> {
    // A single `read` call may return fewer bytes than are available (pipes, chained readers)
    // or fail with `ErrorKind::Interrupted`. `read_to_end` on a length-limited adapter keeps
    // reading until the sample is full or the input ends, and retries interrupted reads.
    let mut sample = Vec::with_capacity(BUFFER_CHECK_AMOUNT);
    file.by_ref()
        // Widening `usize` to `u64` cannot truncate on any supported target.
        .take(BUFFER_CHECK_AMOUNT as u64)
        .read_to_end(&mut sample)?;

    let suspicious_chars = String::from_utf8_lossy(&sample)
        .chars()
        .filter(|&c| was_utf8_char_replaced(c) || is_binary_char(c))
        .count();

    Ok(suspicious_chars > BINARY_CHAR_THRESHOLD as usize)
}
/// Tells whether `c` is U+FFFD, the replacement character that lossy UTF-8 decoding substitutes
/// for byte sequences it cannot decode.
fn was_utf8_char_replaced(c: char) -> bool {
    matches!(c, '\u{FFFD}')
}
/// Tells whether `c` is one of the code points this module counts as evidence of binary content.
///
/// The set consists of control characters (except backspace, tab, line feed, form feed and
/// carriage return), the Unicode line and paragraph separators, and the first and last code
/// points of the surrogate and private-use blocks. Only those boundary code points are listed,
/// not the blocks in between.
// Code points are written in their conventional ungrouped hexadecimal form.
#[allow(clippy::unreadable_literal)]
fn is_binary_char(c: char) -> bool {
    matches!(
        u32::from(c),
        // C0 controls; 0x08-0x0a, 0x0c and 0x0d are deliberately left out.
        0x0000..=0x0007
            | 0x000b
            | 0x000e..=0x001f
            // DEL followed by the C1 controls.
            | 0x007f..=0x009f
            // LINE SEPARATOR and PARAGRAPH SEPARATOR.
            | 0x2028
            | 0x2029
            // Surrogate block boundaries. A `char` is never a surrogate, so these cannot
            // match; they are listed for completeness only.
            | 0xd800
            | 0xdb7f..=0xdb80
            | 0xdbff..=0xdc00
            | 0xdfff
            // First and last code point of each private-use area (BMP, plane 15, plane 16).
            | 0xe000
            | 0xf8ff
            | 0xf0000
            | 0xffffd
            | 0x100000
            | 0x10fffd
    )
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;
    use test_case::test_case;

    // Each case feeds a byte string to `is_file_likely_binary` through an in-memory reader and
    // states whether the input is expected to be classified as binary.
    #[test_case(b"", false; "empty input is not binary")]
    #[test_case(b"hello", false; "simple string is utf-8")]
    #[test_case(b"hello\xff\xffworld", false; "single non-utf-8 is ok")]
    #[test_case(b"hello\x00world", false; "single binary char is ok")]
    #[test_case(b"hello\x00\xffworld", false; "single non-utf-8 and binary is ok")]
    #[test_case(b"hello\xff\xffworld\xfa\xfb\xfc\xfd\xfe", true; "too many non-utf-8 is not ok")]
    #[test_case(b"hello\0\0\0\0\0\0world", true; "null terms are binary chars")]
    #[test_case(b"\x7f\x45\x4c\x46\x02\x01\x01\x00\x00 ", true; "elf header is binary")]
    fn test_is_file_likely_binary(s: &[u8], is_binary: bool) {
        let mut byte_reader = Cursor::new(s);
        assert_eq!(is_binary, is_file_likely_binary(&mut byte_reader).unwrap());
    }
}