rusty_files/utils/
encoding.rs1use encoding_rs::{Encoding, UTF_8};
2use std::fs::File;
3use std::io::Read;
4use std::path::Path;
5
6pub fn detect_encoding(data: &[u8]) -> &'static Encoding {
7 let (encoding, _) = Encoding::for_bom(data).unwrap_or((UTF_8, 0));
8
9 if encoding == UTF_8 {
10 return encoding;
11 }
12
13 let mut detector = chardetng::EncodingDetector::new();
14 detector.feed(data, true);
15 detector.guess(None, true)
16}
17
18pub fn read_file_with_encoding<P: AsRef<Path>>(path: P, max_size: u64) -> std::io::Result<String> {
19 let mut file = File::open(path)?;
20 let file_size = file.metadata()?.len();
21
22 let read_size = std::cmp::min(file_size, max_size);
23 let mut buffer = vec![0u8; read_size as usize];
24
25 file.read_exact(&mut buffer)?;
26
27 let encoding = detect_encoding(&buffer);
28 let (decoded, _, had_errors) = encoding.decode(&buffer);
29
30 if had_errors {
31 Ok(String::from_utf8_lossy(&buffer).to_string())
32 } else {
33 Ok(decoded.to_string())
34 }
35}
36
/// Heuristically decides whether `data` looks like text rather than binary content.
///
/// Only the first 8192 bytes are inspected. The sample is rejected when more than one
/// tenth of it is NUL bytes, or when the number of control bytes (values below 32 other
/// than `\n`, `\r` and `\t`) reaches `max(1, sample_len / 20)`. For samples shorter than
/// 40 bytes that means a single control byte is enough to reject.
///
/// Empty input is considered text.
pub fn is_likely_text(data: &[u8]) -> bool {
    const SAMPLE_LIMIT: usize = 8192;

    let window = &data[..data.len().min(SAMPLE_LIMIT)];
    let len = window.len();
    if len == 0 {
        return true;
    }

    // One pass over the sample; NUL is tallied separately and also counts as a control byte.
    let mut nulls = 0usize;
    let mut controls = 0usize;
    for &byte in window {
        match byte {
            0 => {
                nulls += 1;
                controls += 1;
            }
            b'\n' | b'\r' | b'\t' => {}
            other if other < 32 => controls += 1,
            _ => {}
        }
    }

    let control_limit = (len / 20).max(1);
    nulls <= len / 10 && controls < control_limit
}
59
/// Returns `true` when `data` is, in its entirety, well-formed UTF-8.
///
/// The empty slice is valid UTF-8. A multi-byte sequence truncated at the end of the
/// slice makes the whole input invalid.
pub fn is_utf8(data: &[u8]) -> bool {
    let validation = std::str::from_utf8(data);
    validation.is_ok()
}
63
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII and empty input count as text; a buffer of only NUL bytes does not.
    #[test]
    fn test_is_likely_text() {
        let ascii_greeting: &[u8] = b"Hello, world!";
        let empty: &[u8] = b"";
        let all_nulls = [0u8; 100];

        assert!(is_likely_text(ascii_greeting));
        assert!(is_likely_text(empty));
        assert!(!is_likely_text(&all_nulls));
    }

    /// ASCII and multi-byte Japanese text validate; bytes 0xFF/0xFE/0xFD do not.
    #[test]
    fn test_is_utf8() {
        let ascii: &[u8] = b"Hello, world!";
        let japanese = "こんにちは".as_bytes();
        let invalid = [0xFF, 0xFE, 0xFD];

        assert!(is_utf8(ascii));
        assert!(is_utf8(japanese));
        assert!(!is_utf8(&invalid));
    }
}