1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
use std::path::Path;
use std::fs::File;
use std::io::Read;
use std::io::Result;
use chardet;
use encoding::DecoderTrap;
use encoding::label::encoding_from_whatwg_label;
/// Maximum number of bytes read from the start of a file when classifying it;
/// `is_binary` passes this as the read limit to `get_starting_chunk`.
const STARTING_CHUNK_SIZE: u64 = 1024;
pub fn is_binary<P: AsRef<Path>>(path: P) -> Result<bool> {
let chunk = get_starting_chunk(path, STARTING_CHUNK_SIZE)?;
Ok(is_binary_string(chunk))
}
/// Reads at most `limit` bytes from the beginning of the file at `path`.
///
/// The returned vector is shorter than `limit` when the file itself is
/// shorter, and empty when the file is empty or `limit` is zero.
///
/// # Errors
///
/// Propagates any I/O error raised while opening or reading the file.
fn get_starting_chunk<P: AsRef<Path>>(path: P, limit: u64) -> Result<Vec<u8>> {
    let mut head = Vec::new();
    // `take` caps the reader so `read_to_end` stops after `limit` bytes.
    File::open(path)?.take(limit).read_to_end(&mut head)?;
    Ok(head)
}
/// Returns `true` when `value` lies in the range 127..=255.
///
/// Note that the range starts at 127 (DEL), not 128.
fn is_printable_high_ascii(value: &u8) -> bool {
    // A `u8` can never exceed 255, so only the lower bound needs checking.
    *value >= 127
}
/// Returns `true` when `value` is a visible ASCII character (32..=126) or one
/// of the accepted control characters: line feed, carriage return, tab,
/// form feed (`0x0C`) or backspace (`0x08`).
fn is_printable_ascii(value: &u8) -> bool {
    // Control bytes that are still counted as text.
    const TEXT_CONTROLS: [u8; 5] = [b'\n', b'\r', b'\t', 0x0C, 0x08];

    let byte = *value;
    (0x20..=0x7E).contains(&byte) || TEXT_CONTROLS.contains(&byte)
}
/// Checks whether `bytes_to_check` decodes without error in the encoding
/// named by `detected_encoding`.
///
/// The charset name is first mapped through `chardet::charset2encoding` and
/// then resolved to a decoder with `encoding_from_whatwg_label`. Decoding uses
/// `DecoderTrap::Strict`, so any decode error yields `false`. An encoding
/// label that cannot be resolved to a decoder also yields `false`.
///
/// The byte parameter is a slice rather than `&Vec<u8>`; existing callers that
/// pass `&Vec<u8>` keep working through deref coercion.
fn decoding_possible(bytes_to_check: &[u8], detected_encoding: String) -> bool {
    let label = chardet::charset2encoding(&detected_encoding);
    encoding_from_whatwg_label(label)
        .map_or(false, |coder| coder.decode(bytes_to_check, DecoderTrap::Strict).is_ok())
}
/// Reports whether `bytes` contains a NUL byte (`0x00`) or a `0xFF` byte.
///
/// Despite the name, both values count as a hit. An empty slice yields
/// `false`.
///
/// The parameter is a slice rather than `&Vec<u8>`; existing callers that pass
/// `&Vec<u8>` keep working through deref coercion.
fn contains_null_bytes(bytes: &[u8]) -> bool {
    // Single pass over the data instead of one scan per marker byte.
    bytes.iter().any(|&byte| byte == 0x00 || byte == 0xFF)
}
/// Heuristically decides whether `bytes_to_check` looks like binary data.
///
/// The decision combines three signals:
///
/// 1. Two byte-class ratios: the share of bytes rejected by
///    `is_printable_ascii`, and the share of bytes rejected by
///    `is_printable_high_ascii` (that is, bytes below 127).
/// 2. Whether `chardet` detects a non-"ascii" encoding with confidence above
///    0.9 and the bytes strictly decode in that encoding.
/// 3. Whether the data contains a marker byte according to
///    `contains_null_bytes`.
///
/// Data that decodes cleanly in the detected encoding is never reported as
/// binary. Otherwise it is binary when the ratios flag it or a marker byte is
/// present. Empty input is treated as text.
fn is_binary_string(bytes_to_check: Vec<u8>) -> bool {
    if bytes_to_check.is_empty() {
        return false;
    }
    let total = bytes_to_check.len() as f64;

    // Tally both byte classes in one pass.
    let mut outside_printable = 0usize;
    let mut outside_high = 0usize;
    for byte in &bytes_to_check {
        if !is_printable_ascii(byte) {
            outside_printable += 1;
        }
        if !is_printable_high_ascii(byte) {
            outside_high += 1;
        }
    }
    let nontext_ratio1 = (outside_printable as f64) / total;
    let nontext_ratio2 = (outside_high as f64) / total;

    // Either almost everything is a high byte, or almost everything is a
    // low, non-printable byte.
    let mostly_high_bytes = nontext_ratio1 > 0.3 && nontext_ratio2 < 0.05;
    let mostly_low_unprintable = nontext_ratio1 > 0.8 && nontext_ratio2 > 0.8;
    let is_likely_binary = mostly_high_bytes || mostly_low_unprintable;

    // Only trust a confident, non-ASCII detection, and only attempt the
    // decode in that case.
    let (charset, confidence, ..) = chardet::detect(&bytes_to_check);
    let decodable_as_unicode =
        confidence > 0.9 && charset != "ascii" && decoding_possible(&bytes_to_check, charset);

    // Cleanly decodable data is text; otherwise either signal marks binary.
    !decodable_as_unicode && (is_likely_binary || contains_null_bytes(&bytes_to_check))
}