use chardetng::EncodingDetector;
use simdutf8::basic::from_utf8;
/// Reports whether `data` is a well-formed UTF-8 byte sequence.
///
/// Validation is delegated to `simdutf8::basic::from_utf8`; only the
/// success or failure of that call is used, the decoded `&str` is discarded.
pub fn is_utf8(data: &[u8]) -> bool {
    let validation = from_utf8(data);
    validation.is_ok()
}
/// Reports whether `data` begins with the three-byte UTF-8 byte order mark
/// `EF BB BF`.
///
/// Inputs shorter than three bytes never match.
pub fn has_utf8_bom(data: &[u8]) -> bool {
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];
    data.starts_with(&UTF8_BOM)
}
/// Returns `data` with a leading UTF-8 byte order mark (`EF BB BF`) removed.
///
/// When the input does not start with the BOM it is returned unchanged. The
/// result always borrows from `data`; nothing is copied.
pub fn skip_bom(data: &[u8]) -> &[u8] {
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];
    // `strip_prefix` tests for the prefix and slices past it in one step, so
    // there is no hard-coded offset that must agree with a separate length
    // check, and no indexing that could panic.
    data.strip_prefix(&UTF8_BOM).unwrap_or(data)
}
/// Inspects `data` and summarises it as an [`EncodingInfo`].
///
/// Two facts are reported: whether the input starts with a UTF-8 byte order
/// mark, and whether the bytes following that mark (or all bytes, when there
/// is no mark) are valid UTF-8.
pub fn detect_encoding(data: &[u8]) -> EncodingInfo {
    // Validation runs on the payload with any leading BOM removed.
    let payload = skip_bom(data);
    EncodingInfo {
        is_utf8: is_utf8(payload),
        has_bom: has_utf8_bom(data),
    }
}
/// Summary of a byte buffer's encoding, as returned by `detect_encoding`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct EncodingInfo {
/// As filled in by `detect_encoding`: `true` when the bytes following any
/// UTF-8 byte order mark are valid UTF-8.
pub is_utf8: bool,
/// As filled in by `detect_encoding`: `true` when the buffer starts with the
/// UTF-8 byte order mark `EF BB BF`.
pub has_bom: bool,
}
impl EncodingInfo {
pub const fn new(is_utf8: bool, has_bom: bool) -> Self {
Self { is_utf8, has_bom }
}
}
/// Returns `data` as UTF-8 bytes, transcoding from another encoding when
/// necessary.
///
/// The source encoding is chosen in this order:
///
/// 1. A leading `FF FE` selects UTF-16LE and a leading `FE FF` selects
///    UTF-16BE.
/// 2. Otherwise, input that already validates as UTF-8 is returned as is.
/// 3. Otherwise, the encoding guessed by `chardetng::EncodingDetector` is
///    used.
///
/// # Returns
///
/// A pair of the resulting bytes and a flag. The flag is `true` when the
/// bytes were produced by decoding (the `Cow` is then owned), and `false`
/// when the input is handed back borrowed and untouched.
pub fn detect_and_transcode(data: &[u8]) -> (std::borrow::Cow<'_, [u8]>, bool) {
    use std::borrow::Cow;

    let source_encoding = match data {
        // UTF-16 byte order marks take priority over every other check.
        [0xFF, 0xFE, ..] => encoding_rs::UTF_16LE,
        [0xFE, 0xFF, ..] => encoding_rs::UTF_16BE,
        // Already UTF-8: nothing to convert, so avoid allocating.
        _ if is_utf8(data) => return (Cow::Borrowed(data), false),
        _ => {
            let mut sniffer = EncodingDetector::new();
            sniffer.feed(data, true);
            let guess = sniffer.guess(None, true);
            // NOTE(review): this branch is only reached after `is_utf8(data)`
            // returned false, so a UTF-8 guess hands the caller bytes that
            // failed validation, with the "transcoded" flag unset.
            if guess == encoding_rs::UTF_8 {
                return (Cow::Borrowed(data), false);
            }
            guess
        }
    };

    // The "encoding actually used" and "had errors" results are ignored.
    let (text, _, _) = source_encoding.decode(data);
    (Cow::Owned(text.into_owned().into_bytes()), true)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// UTF-8 byte order mark, used to build BOM-prefixed fixtures.
    const BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    #[test]
    fn test_is_utf8() {
        assert!(is_utf8(b"Hello, World!"));
        assert!(is_utf8("こんにちは".as_bytes()));
        assert!(is_utf8(b""));
    }

    #[test]
    fn test_invalid_utf8() {
        assert!(!is_utf8(&[0xFF, 0xFE]));
        assert!(!is_utf8(&[0x80, 0x81, 0x82]));
    }

    #[test]
    fn test_utf8_bom() {
        let with_bom = [0xEF, 0xBB, 0xBF, b'a', b'b', b'c'];
        let without_bom = b"abc";
        assert!(has_utf8_bom(&with_bom));
        assert!(!has_utf8_bom(without_bom));
        assert_eq!(skip_bom(&with_bom), b"abc");
        assert_eq!(skip_bom(without_bom), b"abc");
    }

    /// Inputs shorter than the BOM, or consisting of nothing but the BOM,
    /// must be handled without panicking.
    #[test]
    fn test_utf8_bom_short_inputs() {
        let truncated: &[u8] = &[0xEF, 0xBB];
        assert!(!has_utf8_bom(b""));
        assert!(!has_utf8_bom(truncated));
        assert!(has_utf8_bom(&BOM));
        assert_eq!(skip_bom(b""), b"");
        assert_eq!(skip_bom(truncated), truncated);
        assert_eq!(skip_bom(&BOM), b"");
    }

    #[test]
    fn test_detect_encoding() {
        let info = detect_encoding(b"Hello");
        assert!(info.is_utf8);
        assert!(!info.has_bom);
        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
        let info = detect_encoding(&with_bom);
        assert!(info.is_utf8);
        assert!(info.has_bom);
    }

    /// The two flags are independent: a BOM may be followed by invalid bytes.
    #[test]
    fn test_detect_encoding_invalid_utf8() {
        assert_eq!(
            detect_encoding(&[0x80, 0x81]),
            EncodingInfo::new(false, false)
        );
        assert_eq!(
            detect_encoding(&[0xEF, 0xBB, 0xBF, 0xFF]),
            EncodingInfo::new(false, true)
        );
    }

    #[test]
    fn test_detect_and_transcode_utf8() {
        let data = b"Hello, World!";
        let (result, was_transcoded) = detect_and_transcode(data);
        assert!(!was_transcoded);
        assert_eq!(&result[..], data);
        // Valid UTF-8 must be passed through without allocating.
        assert!(matches!(result, std::borrow::Cow::Borrowed(_)));
    }

    #[test]
    fn test_detect_and_transcode_empty() {
        let (result, was_transcoded) = detect_and_transcode(b"");
        assert!(!was_transcoded);
        assert!(result.is_empty());
    }

    #[test]
    fn test_detect_and_transcode_utf16_le() {
        let data: &[u8] = &[0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        let (result, was_transcoded) = detect_and_transcode(data);
        assert!(was_transcoded);
        assert!(is_utf8(&result));
        // The BOM must not leak into the decoded text.
        assert_eq!(&result[..], b"Hi");
    }

    #[test]
    fn test_detect_and_transcode_utf16_be() {
        let data: &[u8] = &[0xFE, 0xFF, 0x00, b'H', 0x00, b'i'];
        let (result, was_transcoded) = detect_and_transcode(data);
        assert!(was_transcoded);
        assert!(is_utf8(&result));
        assert_eq!(&result[..], b"Hi");
    }

    #[test]
    fn test_detect_and_transcode_windows1251() {
        let data: &[u8] = &[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2];
        let (result, was_transcoded) = detect_and_transcode(data);
        assert!(was_transcoded);
        assert!(is_utf8(&result));
    }
}