rusty_files/utils/
encoding.rs

1use encoding_rs::{Encoding, UTF_8};
2use std::fs::File;
3use std::io::Read;
4use std::path::Path;
5
6pub fn detect_encoding(data: &[u8]) -> &'static Encoding {
7    let (encoding, _) = Encoding::for_bom(data).unwrap_or((UTF_8, 0));
8
9    if encoding == UTF_8 {
10        return encoding;
11    }
12
13    let mut detector = chardetng::EncodingDetector::new();
14    detector.feed(data, true);
15    detector.guess(None, true)
16}
17
18pub fn read_file_with_encoding<P: AsRef<Path>>(path: P, max_size: u64) -> std::io::Result<String> {
19    let mut file = File::open(path)?;
20    let file_size = file.metadata()?.len();
21
22    let read_size = std::cmp::min(file_size, max_size);
23    let mut buffer = vec![0u8; read_size as usize];
24
25    file.read_exact(&mut buffer)?;
26
27    let encoding = detect_encoding(&buffer);
28    let (decoded, _, had_errors) = encoding.decode(&buffer);
29
30    if had_errors {
31        Ok(String::from_utf8_lossy(&buffer).to_string())
32    } else {
33        Ok(decoded.to_string())
34    }
35}
36
/// Heuristically decides whether `data` looks like text rather than binary.
///
/// Only the first 8192 bytes are inspected. Empty input counts as text.
/// The sample is rejected when:
/// - more than one tenth of it consists of NUL bytes, or
/// - it contains more control bytes (values below 0x20 other than `\n`, `\r`
///   and `\t`; NUL is included) than the allowance of 5% of the sample,
///   where the allowance is never less than one byte.
pub fn is_likely_text(data: &[u8]) -> bool {
    const SAMPLE_LIMIT: usize = 8192;

    if data.is_empty() {
        return true;
    }

    let sample = &data[..data.len().min(SAMPLE_LIMIT)];
    let sample_size = sample.len();

    let null_count = sample.iter().filter(|&&b| b == 0).count();
    if null_count > sample_size / 10 {
        return false;
    }

    let control_count = sample
        .iter()
        .filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t'))
        .count();

    // Allow up to 1 control character for small samples, or 5% for larger
    // ones. The comparison is inclusive so the allowance itself is accepted;
    // a strict `<` would permit zero control bytes in samples under 40 bytes.
    let allowance = std::cmp::max(1, sample_size / 20);
    control_count <= allowance
}
59
/// Reports whether `data` is a well-formed UTF-8 byte sequence.
///
/// Empty input is considered valid.
pub fn is_utf8(data: &[u8]) -> bool {
    let validation = std::str::from_utf8(data);
    validation.is_ok()
}
63
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII and empty input are text; a buffer of NUL bytes is not.
    #[test]
    fn test_is_likely_text() {
        let textual: [&[u8]; 2] = [b"Hello, world!", b""];
        for sample in textual.iter() {
            assert!(is_likely_text(sample), "expected text: {:?}", sample);
        }

        let zeros = [0u8; 100];
        assert!(!is_likely_text(&zeros));
    }

    /// ASCII and multi-byte UTF-8 validate; bytes 0xFF/0xFE/0xFD do not.
    #[test]
    fn test_is_utf8() {
        let valid: [&[u8]; 2] = [b"Hello, world!", "こんにちは".as_bytes()];
        for bytes in valid.iter() {
            assert!(is_utf8(bytes), "expected valid UTF-8: {:?}", bytes);
        }

        let invalid = [0xFF, 0xFE, 0xFD];
        assert!(!is_utf8(&invalid));
    }
}
81}