Skip to main content

csv_nose/
encoding.rs

1//! Encoding detection and transcoding using chardetng and `encoding_rs`.
2
3use chardetng::EncodingDetector;
4use simdutf8::basic::from_utf8;
5
/// Report whether `data` is a well-formed UTF-8 byte sequence.
///
/// Validation is delegated to `simdutf8::basic::from_utf8`, the
/// SIMD-accelerated validator imported at the top of this module. Only the
/// success/failure outcome is kept; the decoded `&str` is discarded.
///
/// An empty slice counts as valid UTF-8.
pub fn is_utf8(data: &[u8]) -> bool {
    let validation = from_utf8(data);
    validation.is_ok()
}
12
/// Report whether `data` begins with the UTF-8 byte order mark (BOM).
///
/// The UTF-8 BOM is the three-byte sequence `EF BB BF`. Inputs shorter than
/// three bytes (including the empty slice) never match.
pub fn has_utf8_bom(data: &[u8]) -> bool {
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];
    data.starts_with(&UTF8_BOM)
}
19
20/// Skip the UTF-8 BOM if present and return the remaining data.
21pub fn skip_bom(data: &[u8]) -> &[u8] {
22    if has_utf8_bom(data) { &data[3..] } else { data }
23}
24
25/// Detect the encoding of the data.
26///
27/// Currently only supports UTF-8 detection. Returns true if valid UTF-8.
28pub fn detect_encoding(data: &[u8]) -> EncodingInfo {
29    let has_bom = has_utf8_bom(data);
30    let data_without_bom = skip_bom(data);
31    let valid_utf8 = is_utf8(data_without_bom);
32
33    EncodingInfo {
34        is_utf8: valid_utf8,
35        has_bom,
36    }
37}
38
/// Result of a UTF-8 encoding check, as produced by `detect_encoding`.
///
/// The type is a plain pair of flags: it is `Copy`, comparable with `==`, and
/// printable with `{:?}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct EncodingInfo {
    /// `true` when the inspected bytes are valid UTF-8.
    pub is_utf8: bool,
    /// `true` when the input started with the UTF-8 BOM.
    pub has_bom: bool,
}

impl EncodingInfo {
    /// Build an `EncodingInfo` from its two flags.
    ///
    /// This is a `const fn`, so it can also initialize constants and statics.
    pub const fn new(is_utf8: bool, has_bom: bool) -> Self {
        EncodingInfo { is_utf8, has_bom }
    }
}
54
55/// Detect the encoding of data and transcode to UTF-8 if necessary.
56///
57/// Uses chardetng for robust encoding detection supporting:
58/// - Windows-1251 (Cyrillic)
59/// - Windows-1250 (Central European)
60/// - ISO-8859 variants
61/// - GB2312/GBK (Chinese)
62/// - UTF-16 LE/BE
63/// - And many more
64///
65/// Returns (`transcoded_data`, `was_transcoded`). If `was_transcoded` is false,
66/// the original data is returned as-is (it was already valid UTF-8).
67pub fn detect_and_transcode(data: &[u8]) -> (std::borrow::Cow<'_, [u8]>, bool) {
68    // Check for UTF-16 BOM first (chardetng doesn't handle these well)
69    if data.len() >= 2 {
70        // UTF-16 LE BOM: FF FE
71        if data[0] == 0xFF && data[1] == 0xFE {
72            let (decoded, _, _) = encoding_rs::UTF_16LE.decode(data);
73            return (
74                std::borrow::Cow::Owned(decoded.into_owned().into_bytes()),
75                true,
76            );
77        }
78        // UTF-16 BE BOM: FE FF
79        if data[0] == 0xFE && data[1] == 0xFF {
80            let (decoded, _, _) = encoding_rs::UTF_16BE.decode(data);
81            return (
82                std::borrow::Cow::Owned(decoded.into_owned().into_bytes()),
83                true,
84            );
85        }
86    }
87
88    // Check if already valid UTF-8
89    if is_utf8(data) {
90        return (std::borrow::Cow::Borrowed(data), false);
91    }
92
93    // Use chardetng to detect encoding
94    let mut detector = EncodingDetector::new();
95    detector.feed(data, true);
96    let encoding = detector.guess(None, true);
97
98    // If detected as UTF-8, return as-is (might have some invalid bytes)
99    if encoding == encoding_rs::UTF_8 {
100        return (std::borrow::Cow::Borrowed(data), false);
101    }
102
103    // Transcode to UTF-8
104    let (decoded, _, _) = encoding.decode(data);
105    (
106        std::borrow::Cow::Owned(decoded.into_owned().into_bytes()),
107        true,
108    )
109}
110
#[cfg(test)]
mod tests {
    //! Unit tests for UTF-8 validation, BOM handling and transcoding.

    use super::*;

    #[test]
    fn test_is_utf8() {
        assert!(is_utf8(b"Hello, World!"));
        assert!(is_utf8("こんにちは".as_bytes()));
        assert!(is_utf8(b""));
    }

    #[test]
    fn test_invalid_utf8() {
        // Bytes that can never appear at these positions in UTF-8.
        assert!(!is_utf8(&[0xFF, 0xFE]));
        assert!(!is_utf8(&[0x80, 0x81, 0x82]));
    }

    #[test]
    fn test_utf8_bom() {
        let with_bom = [0xEF, 0xBB, 0xBF, b'a', b'b', b'c'];
        let without_bom = b"abc";

        assert!(has_utf8_bom(&with_bom));
        assert!(!has_utf8_bom(without_bom));

        assert_eq!(skip_bom(&with_bom), b"abc");
        assert_eq!(skip_bom(without_bom), b"abc");
    }

    #[test]
    fn test_utf8_bom_short_inputs() {
        // A truncated BOM or empty input must not be reported as a BOM.
        assert!(!has_utf8_bom(&[]));
        assert!(!has_utf8_bom(&[0xEF, 0xBB]));

        // Such inputs pass through `skip_bom` unchanged.
        assert!(skip_bom(&[]).is_empty());
        assert_eq!(skip_bom(&[0xEF, 0xBB]), &[0xEF, 0xBB]);

        // Input consisting of nothing but the BOM leaves an empty payload.
        assert!(skip_bom(&[0xEF, 0xBB, 0xBF]).is_empty());
    }

    #[test]
    fn test_detect_encoding() {
        let info = detect_encoding(b"Hello");
        assert!(info.is_utf8);
        assert!(!info.has_bom);

        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
        let info = detect_encoding(&with_bom);
        assert!(info.is_utf8);
        assert!(info.has_bom);
    }

    #[test]
    fn test_detect_encoding_invalid_payload_after_bom() {
        // The two flags are independent: a BOM followed by bytes that are
        // not valid UTF-8 reports the BOM but not UTF-8 validity.
        let data = [0xEF, 0xBB, 0xBF, 0xFF];
        let info = detect_encoding(&data);
        assert!(!info.is_utf8);
        assert!(info.has_bom);
    }

    #[test]
    fn test_detect_and_transcode_utf8() {
        // Valid UTF-8 should not be transcoded, nor copied.
        let data = b"Hello, World!";
        let (result, was_transcoded) = detect_and_transcode(data);
        assert!(!was_transcoded);
        assert_eq!(&result[..], data);
        assert!(matches!(result, std::borrow::Cow::Borrowed(_)));
    }

    #[test]
    fn test_detect_and_transcode_utf16_le() {
        // UTF-16 LE with BOM: "Hi"
        let data: &[u8] = &[0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        let (result, was_transcoded) = detect_and_transcode(data);
        assert!(was_transcoded);
        assert!(is_utf8(&result));
        // The decoded text must not carry the BOM over into the content.
        assert_eq!(&result[..], b"Hi");
    }

    #[test]
    fn test_detect_and_transcode_utf16_be() {
        // UTF-16 BE with BOM: "Hi"
        let data: &[u8] = &[0xFE, 0xFF, 0x00, b'H', 0x00, b'i'];
        let (result, was_transcoded) = detect_and_transcode(data);
        assert!(was_transcoded);
        assert!(is_utf8(&result));
        assert_eq!(&result[..], b"Hi");
    }

    #[test]
    fn test_detect_and_transcode_windows1251() {
        // Windows-1251 encoded Cyrillic text: "Привет" (Hello in Russian)
        // П=0xCF, р=0xF0, и=0xE8, в=0xE2, е=0xE5, т=0xF2
        let data: &[u8] = &[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2];
        let (result, was_transcoded) = detect_and_transcode(data);
        // Should be transcoded since it's not valid UTF-8
        assert!(was_transcoded);
        // Result should be valid UTF-8
        assert!(is_utf8(&result));
    }
}