use super::encoding_heuristics::has_windows1250_pattern;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
// Text encodings this module can detect and convert to/from UTF-8.
//
// NOTE(review): plain `//` comments are used here on purpose. `///` doc
// comments on a `JsonSchema`-derived type may be copied into the generated
// schema as descriptions — confirm before converting these to doc comments.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, JsonSchema)]
pub enum Encoding {
// UTF-8 without a byte-order mark. This is the `Default` variant.
#[default]
Utf8,
// UTF-8 whose byte stream starts with the BOM `EF BB BF`.
Utf8Bom,
// UTF-16, little endian (BOM `FF FE`).
Utf16Le,
// UTF-16, big endian (BOM `FE FF`).
Utf16Be,
// US-ASCII; `to_encoding_rs` treats it as UTF-8.
Ascii,
// ISO-8859-1 / Latin-1; `to_encoding_rs` maps it to windows-1252.
Latin1,
// Windows-1252 / CP1252 (Western European).
Windows1252,
// Windows-1250 / CP1250 (Central European).
Windows1250,
// GB18030 (Chinese).
Gb18030,
// GBK / CP936 (Simplified Chinese).
Gbk,
// Shift_JIS (Japanese).
ShiftJis,
// EUC-KR (Korean).
EucKr,
}
impl Encoding {
pub fn display_name(&self) -> &'static str {
match self {
Self::Utf8 => "UTF-8",
Self::Utf8Bom => "UTF-8 BOM",
Self::Utf16Le => "UTF-16 LE",
Self::Utf16Be => "UTF-16 BE",
Self::Ascii => "ASCII",
Self::Latin1 => "Latin-1",
Self::Windows1252 => "Windows-1252",
Self::Windows1250 => "Windows-1250",
Self::Gb18030 => "GB18030",
Self::Gbk => "GBK",
Self::ShiftJis => "Shift-JIS",
Self::EucKr => "EUC-KR",
}
}
pub fn description(&self) -> &'static str {
match self {
Self::Utf8 => "UTF-8",
Self::Utf8Bom => "UTF-8 with BOM",
Self::Utf16Le => "UTF-16 Little Endian",
Self::Utf16Be => "UTF-16 Big Endian",
Self::Ascii => "US-ASCII",
Self::Latin1 => "ISO-8859-1 / Latin-1 – Western European",
Self::Windows1252 => "Windows-1252 / CP1252 – Western European",
Self::Windows1250 => "Windows-1250 / CP1250 – Central European",
Self::Gb18030 => "GB18030 – Chinese",
Self::Gbk => "GBK / CP936 – Simplified Chinese",
Self::ShiftJis => "Shift_JIS – Japanese",
Self::EucKr => "EUC-KR – Korean",
}
}
pub fn to_encoding_rs(&self) -> &'static encoding_rs::Encoding {
match self {
Self::Utf8 | Self::Utf8Bom | Self::Ascii => encoding_rs::UTF_8,
Self::Utf16Le => encoding_rs::UTF_16LE,
Self::Utf16Be => encoding_rs::UTF_16BE,
Self::Latin1 => encoding_rs::WINDOWS_1252, Self::Windows1252 => encoding_rs::WINDOWS_1252,
Self::Windows1250 => encoding_rs::WINDOWS_1250,
Self::Gb18030 => encoding_rs::GB18030,
Self::Gbk => encoding_rs::GBK,
Self::ShiftJis => encoding_rs::SHIFT_JIS,
Self::EucKr => encoding_rs::EUC_KR,
}
}
pub fn has_bom(&self) -> bool {
matches!(self, Self::Utf8Bom | Self::Utf16Le | Self::Utf16Be)
}
pub fn bom_bytes(&self) -> Option<&'static [u8]> {
match self {
Self::Utf8Bom => Some(&[0xEF, 0xBB, 0xBF]),
Self::Utf16Le => Some(&[0xFF, 0xFE]),
Self::Utf16Be => Some(&[0xFE, 0xFF]),
_ => None,
}
}
pub fn all() -> &'static [Encoding] {
&[
Self::Utf8,
Self::Utf8Bom,
Self::Utf16Le,
Self::Utf16Be,
Self::Ascii,
Self::Latin1,
Self::Windows1252,
Self::Windows1250,
Self::Gb18030,
Self::Gbk,
Self::ShiftJis,
Self::EucKr,
]
}
pub fn is_resynchronizable(&self) -> bool {
match self {
Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => true,
Self::Utf8 | Self::Utf8Bom => true,
Self::Utf16Le | Self::Utf16Be => true,
Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => false,
}
}
pub fn alignment(&self) -> Option<usize> {
match self {
Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => Some(1),
Self::Utf8 | Self::Utf8Bom => Some(1),
Self::Utf16Le | Self::Utf16Be => Some(2),
Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => None,
}
}
pub fn requires_full_file_load(&self) -> bool {
!self.is_resynchronizable()
}
}
/// Detects the encoding of `bytes`.
///
/// Thin wrapper around [`detect_encoding_or_binary`] that drops the
/// "looks binary" flag and returns only the encoding.
pub fn detect_encoding(bytes: &[u8]) -> Encoding {
    let (encoding, _is_binary) = detect_encoding_or_binary(bytes);
    encoding
}
pub fn detect_encoding_or_binary(bytes: &[u8]) -> (Encoding, bool) {
let check_len = bytes.len().min(8 * 1024);
let sample = &bytes[..check_len];
if sample.starts_with(&[0xEF, 0xBB, 0xBF]) {
return (Encoding::Utf8Bom, false);
}
if sample.starts_with(&[0xFF, 0xFE]) {
return (Encoding::Utf16Le, false);
}
if sample.starts_with(&[0xFE, 0xFF]) {
return (Encoding::Utf16Be, false);
}
let utf8_valid_len = match std::str::from_utf8(sample) {
Ok(_) => sample.len(),
Err(e) => {
if e.error_len().is_none() {
e.valid_up_to()
} else {
0
}
}
};
if utf8_valid_len > 0 && (utf8_valid_len == sample.len() || utf8_valid_len >= sample.len() - 3)
{
let valid_sample = &sample[..utf8_valid_len];
let has_binary_control = valid_sample.iter().any(|&b| is_binary_control_char(b));
if has_binary_control {
return (Encoding::Utf8, true);
}
if valid_sample.iter().all(|&b| b < 128) {
return (Encoding::Ascii, false);
}
return (Encoding::Utf8, false);
}
if sample.len() >= 4 {
let is_printable_or_high = |b: u8| (0x20..=0x7E).contains(&b) || b >= 0x80;
let aligned_len = sample.len() & !1; let aligned_sample = &sample[..aligned_len];
let le_pairs = aligned_sample
.chunks(2)
.filter(|chunk| chunk[1] == 0 && is_printable_or_high(chunk[0]))
.count();
let be_pairs = aligned_sample
.chunks(2)
.filter(|chunk| chunk[0] == 0 && is_printable_or_high(chunk[1]))
.count();
let pair_count = aligned_len / 2;
if le_pairs > pair_count / 2 {
return (Encoding::Utf16Le, false);
}
if be_pairs > pair_count / 2 {
return (Encoding::Utf16Be, false);
}
}
let has_binary_control = sample
.iter()
.any(|&b| b == 0x00 || is_binary_control_char(b));
if has_binary_control {
return (Encoding::Utf8, true);
}
let has_latin1_pattern = has_latin1_high_byte_pattern(sample);
let has_cjk_only_bytes = sample.iter().any(|&b| (0x81..0xA0).contains(&b));
let mut detector = chardetng::EncodingDetector::new();
detector.feed(sample, true);
let (detected_encoding, confident) = detector.guess_assess(None, true);
if confident {
let is_cjk_encoding = detected_encoding == encoding_rs::GB18030
|| detected_encoding == encoding_rs::GBK
|| detected_encoding == encoding_rs::SHIFT_JIS
|| detected_encoding == encoding_rs::EUC_KR;
if is_cjk_encoding && !has_cjk_only_bytes && has_latin1_pattern {
return (Encoding::Windows1252, false);
}
let encoding =
if detected_encoding == encoding_rs::GB18030 || detected_encoding == encoding_rs::GBK {
Encoding::Gb18030
} else if detected_encoding == encoding_rs::SHIFT_JIS {
Encoding::ShiftJis
} else if detected_encoding == encoding_rs::EUC_KR {
Encoding::EucKr
} else if detected_encoding == encoding_rs::WINDOWS_1252
|| detected_encoding == encoding_rs::WINDOWS_1250
{
if has_windows1250_pattern(sample) {
Encoding::Windows1250
} else {
Encoding::Windows1252
}
} else if detected_encoding == encoding_rs::UTF_8 {
if has_windows1250_pattern(sample) {
Encoding::Windows1250
} else {
Encoding::Windows1252
}
} else {
if has_windows1250_pattern(sample) {
Encoding::Windows1250
} else {
Encoding::Windows1252
}
};
return (encoding, false);
}
if has_windows1250_pattern(sample) {
(Encoding::Windows1250, false)
} else {
(Encoding::Windows1252, false)
}
}
/// Returns `true` when `byte` is a control character treated as a sign of
/// binary content.
///
/// That is every byte below `0x20` except TAB (`0x09`), LF (`0x0A`),
/// VT (`0x0B`), FF (`0x0C`), CR (`0x0D`) and ESC (`0x1B`), plus DEL (`0x7F`).
pub fn is_binary_control_char(byte: u8) -> bool {
    match byte {
        // Whitespace-like controls and ESC are allowed in text.
        0x09..=0x0D | 0x1B => false,
        // Every remaining C0 control, and DEL.
        0x00..=0x1F | 0x7F => true,
        _ => false,
    }
}
/// Heuristic: does `sample` contain a high byte in a position that fits
/// Latin-1-style text better than a double-byte (CJK) encoding?
///
/// Returns `true` as soon as either indicator is seen:
/// * a space (`0x20`) immediately followed by a byte `>= 0xA0`, or
/// * a byte `>= 0xA0` followed by a byte `< 0x40`, where the two do not form
///   a plausible lead/trail pair.
///
/// A plausible pair (lead `0x81..=0xFE`, trail `0x40..=0x7E` or
/// `0x80..=0xFE`) is skipped as a unit. A high byte at the very end of the
/// sample is never counted.
fn has_latin1_high_byte_pattern(sample: &[u8]) -> bool {
    let mut pos = 0;
    while let Some(&current) = sample.get(pos) {
        let following = sample.get(pos + 1).copied();

        if current.is_ascii() {
            // Indicator 1: a word that starts with a high byte.
            if current == b' ' && matches!(following, Some(0xA0..=0xFF)) {
                return true;
            }
            pos += 1;
            continue;
        }

        if let Some(trail) = following {
            let plausible_lead = matches!(current, 0x81..=0xFE);
            let plausible_trail = matches!(trail, 0x40..=0x7E | 0x80..=0xFE);
            if plausible_lead && plausible_trail {
                // Could be one double-byte character: consume both bytes.
                pos += 2;
                continue;
            }
            // Indicator 2: a high byte followed by a byte that cannot be a
            // trail byte.
            if current >= 0xA0 && trail < 0x40 {
                return true;
            }
        }
        pos += 1;
    }
    false
}
/// Detects the encoding of `bytes` and returns it together with the content
/// converted to UTF-8.
///
/// * Empty input yields `(Encoding::Utf8, vec![])` without running detection.
/// * `Utf8` / `Ascii` content is copied through unchanged.
/// * `Utf8Bom` content is returned without its 3-byte BOM.
/// * UTF-16 content has a leading BOM (either byte order) removed before
///   decoding.
/// * Every other encoding is decoded with its `encoding_rs` codec.
///
/// Decoding errors are not reported: the `had_errors` flag from the decoder
/// is discarded.
pub fn detect_and_convert(bytes: &[u8]) -> (Encoding, Vec<u8>) {
    if bytes.is_empty() {
        return (Encoding::Utf8, Vec::new());
    }

    let encoding = detect_encoding(bytes);
    let utf8_content = match encoding {
        Encoding::Utf8 | Encoding::Ascii => bytes.to_vec(),
        // Skip the BOM; a BOM-only (or shorter) input leaves nothing behind.
        Encoding::Utf8Bom => bytes.get(3..).map(<[u8]>::to_vec).unwrap_or_default(),
        Encoding::Utf16Le | Encoding::Utf16Be => {
            let payload = match bytes {
                [0xFF, 0xFE, rest @ ..] | [0xFE, 0xFF, rest @ ..] => rest,
                _ => bytes,
            };
            let (decoded, _had_errors) = encoding
                .to_encoding_rs()
                .decode_without_bom_handling(payload);
            decoded.into_owned().into_bytes()
        }
        _ => {
            let (decoded, _had_errors) = encoding
                .to_encoding_rs()
                .decode_without_bom_handling(bytes);
            decoded.into_owned().into_bytes()
        }
    };
    (encoding, utf8_content)
}
/// Converts `bytes`, interpreted as `encoding`, to UTF-8.
///
/// * Empty input yields an empty vector.
/// * `Utf8` / `Ascii` content is copied through unchanged (it is not
///   validated).
/// * `Utf8Bom`: a leading `EF BB BF` is removed when present. This includes
///   input that consists of the BOM alone, which yields empty content.
/// * `Utf16Le` / `Utf16Be`: a leading `FF FE` or `FE FF` is removed before
///   decoding.
/// * Every other encoding is decoded with its `encoding_rs` codec.
///
/// Decoding errors are not reported: the `had_errors` flag from the decoder
/// is discarded.
pub fn convert_to_utf8(bytes: &[u8], encoding: Encoding) -> Vec<u8> {
    if bytes.is_empty() {
        return Vec::new();
    }
    match encoding {
        Encoding::Utf8 | Encoding::Ascii => bytes.to_vec(),
        Encoding::Utf8Bom => match bytes {
            // Matches BOM-only input too (`content` is then empty), so the
            // BOM is never returned as if it were file content.
            [0xEF, 0xBB, 0xBF, content @ ..] => content.to_vec(),
            _ => bytes.to_vec(),
        },
        Encoding::Utf16Le | Encoding::Utf16Be => {
            let payload = match bytes {
                [0xFF, 0xFE, rest @ ..] | [0xFE, 0xFF, rest @ ..] => rest,
                _ => bytes,
            };
            let (decoded, _had_errors) = encoding
                .to_encoding_rs()
                .decode_without_bom_handling(payload);
            decoded.into_owned().into_bytes()
        }
        _ => {
            let (decoded, _had_errors) = encoding
                .to_encoding_rs()
                .decode_without_bom_handling(bytes);
            decoded.into_owned().into_bytes()
        }
    }
}
/// Converts UTF-8 content back into `encoding`.
///
/// * `Utf8`, `Ascii` and `Utf8Bom` return a copy of `utf8_bytes` unchanged.
/// * `Utf16Le` / `Utf16Be` serialise each UTF-16 code unit in the matching
///   byte order.
/// * Every other encoding is encoded with its `encoding_rs` codec.
///
/// No byte-order mark is ever written by this function. Invalid UTF-8 in the
/// input is replaced lossily before encoding, and the `had_errors` flag from
/// the encoder is discarded.
pub fn convert_from_utf8(utf8_bytes: &[u8], encoding: Encoding) -> Vec<u8> {
    if matches!(
        encoding,
        Encoding::Utf8 | Encoding::Ascii | Encoding::Utf8Bom
    ) {
        return utf8_bytes.to_vec();
    }

    let text = String::from_utf8_lossy(utf8_bytes);
    match encoding {
        Encoding::Utf16Le => text.encode_utf16().flat_map(u16::to_le_bytes).collect(),
        Encoding::Utf16Be => text.encode_utf16().flat_map(u16::to_be_bytes).collect(),
        // The UTF-8 family already returned above; what remains are the
        // legacy encodings handled by `encoding_rs`.
        _ => {
            let (encoded, _encoding_used, _had_errors) = encoding.to_encoding_rs().encode(&text);
            encoded.into_owned()
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_encoding_display_names() {
        let expected = [
            (Encoding::Utf8, "UTF-8"),
            (Encoding::Utf8Bom, "UTF-8 BOM"),
            (Encoding::Utf16Le, "UTF-16 LE"),
            (Encoding::Gb18030, "GB18030"),
            (Encoding::Windows1250, "Windows-1250"),
        ];
        for (encoding, name) in expected {
            assert_eq!(encoding.display_name(), name);
        }
    }

    #[test]
    fn test_encoding_bom() {
        for encoding in [Encoding::Utf8Bom, Encoding::Utf16Le] {
            assert!(encoding.has_bom(), "{:?} should have a BOM", encoding);
        }
        for encoding in [
            Encoding::Utf8,
            Encoding::Windows1252,
            Encoding::Windows1250,
        ] {
            assert!(!encoding.has_bom(), "{:?} should not have a BOM", encoding);
        }
    }

    #[test]
    fn test_detect_utf8() {
        // Pure ASCII is reported as `Ascii`; any non-ASCII UTF-8 as `Utf8`.
        let ascii_only = b"Hello, world!";
        let with_cjk = "Hello, 世界!".as_bytes();
        assert_eq!(detect_encoding(ascii_only), Encoding::Ascii);
        assert_eq!(detect_encoding(with_cjk), Encoding::Utf8);
    }

    #[test]
    fn test_detect_utf8_bom() {
        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
        let detected = detect_encoding(&with_bom);
        assert_eq!(detected, Encoding::Utf8Bom);
    }

    #[test]
    fn test_detect_utf16_le() {
        // BOM followed by "Hi" in little-endian code units.
        let utf16_le_bom = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        let detected = detect_encoding(&utf16_le_bom);
        assert_eq!(detected, Encoding::Utf16Le);
    }

    #[test]
    fn test_detect_binary() {
        let binary_data = [0x00, 0x01, 0x02, 0x03];
        let (_encoding, is_binary) = detect_encoding_or_binary(&binary_data);
        assert!(is_binary);
    }

    #[test]
    fn test_is_binary_control_char() {
        // NUL, SOH, STX and DEL count as binary.
        for byte in [0x00u8, 0x01, 0x02, 0x7F] {
            assert!(
                is_binary_control_char(byte),
                "{:#04x} should be binary",
                byte
            );
        }
        // TAB, LF, CR, ESC and ordinary printable characters do not.
        for byte in [0x09u8, 0x0A, 0x0D, 0x1B, b'A', b' '] {
            assert!(
                !is_binary_control_char(byte),
                "{:#04x} should not be binary",
                byte
            );
        }
    }

    #[test]
    fn test_convert_roundtrip_utf8() {
        let source_bytes = "Hello, 世界!".as_bytes();

        let (encoding, utf8_content) = detect_and_convert(source_bytes);
        assert_eq!(encoding, Encoding::Utf8);
        assert_eq!(utf8_content, source_bytes);

        let restored = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(restored, source_bytes);
    }

    #[test]
    fn test_convert_roundtrip_utf16le() {
        let utf16_le = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];

        let (encoding, utf8_content) = detect_and_convert(&utf16_le);
        assert_eq!(encoding, Encoding::Utf16Le);
        assert_eq!(utf8_content, b"Hi");

        // The BOM is not re-emitted by `convert_from_utf8`.
        let restored = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(restored, [b'H', 0x00, b'i', 0x00]);
    }

    #[test]
    fn test_encoding_resynchronizable() {
        let resynchronizable = [
            Encoding::Utf8,
            Encoding::Utf8Bom,
            Encoding::Ascii,
            Encoding::Latin1,
            Encoding::Windows1252,
            Encoding::Windows1250,
            Encoding::Utf16Le,
            Encoding::Utf16Be,
        ];
        for encoding in resynchronizable {
            assert!(
                encoding.is_resynchronizable(),
                "{:?} should be resynchronizable",
                encoding
            );
        }

        let not_resynchronizable = [
            Encoding::Gb18030,
            Encoding::Gbk,
            Encoding::ShiftJis,
            Encoding::EucKr,
        ];
        for encoding in not_resynchronizable {
            assert!(
                !encoding.is_resynchronizable(),
                "{:?} should not be resynchronizable",
                encoding
            );
        }
    }

    #[test]
    fn test_encoding_alignment() {
        let expected = [
            (Encoding::Ascii, Some(1)),
            (Encoding::Latin1, Some(1)),
            (Encoding::Windows1252, Some(1)),
            (Encoding::Windows1250, Some(1)),
            (Encoding::Utf8, Some(1)),
            (Encoding::Utf8Bom, Some(1)),
            (Encoding::Utf16Le, Some(2)),
            (Encoding::Utf16Be, Some(2)),
            (Encoding::Gb18030, None),
            (Encoding::Gbk, None),
            (Encoding::ShiftJis, None),
            (Encoding::EucKr, None),
        ];
        for (encoding, alignment) in expected {
            assert_eq!(encoding.alignment(), alignment);
        }
    }

    #[test]
    fn test_requires_full_file_load() {
        let streamable = [
            Encoding::Utf8,
            Encoding::Ascii,
            Encoding::Latin1,
            Encoding::Windows1250,
            Encoding::Utf16Le,
        ];
        for encoding in streamable {
            assert!(
                !encoding.requires_full_file_load(),
                "{:?} should not require a full file load",
                encoding
            );
        }

        let full_load = [
            Encoding::Gb18030,
            Encoding::Gbk,
            Encoding::ShiftJis,
            Encoding::EucKr,
        ];
        for encoding in full_load {
            assert!(
                encoding.requires_full_file_load(),
                "{:?} should require a full file load",
                encoding
            );
        }
    }

    #[test]
    fn test_convert_roundtrip_windows1250() {
        // "Zażółć" encoded as Windows-1250.
        let windows1250_bytes: &[u8] = &[0x5A, 0x61, 0xBF, 0xF3, 0xB3, 0xE6];

        let (decoded, _had_errors) = Encoding::Windows1250
            .to_encoding_rs()
            .decode_without_bom_handling(windows1250_bytes);
        let utf8_str: &str = decoded.as_ref();

        assert!(utf8_str.contains('ż'), "Should contain ż: {}", utf8_str);
        assert!(utf8_str.contains('ó'), "Should contain ó: {}", utf8_str);
        assert!(utf8_str.contains('ł'), "Should contain ł: {}", utf8_str);
        assert!(utf8_str.contains('ć'), "Should contain ć: {}", utf8_str);

        let restored = convert_from_utf8(utf8_str.as_bytes(), Encoding::Windows1250);
        assert_eq!(
            restored, windows1250_bytes,
            "Round-trip should preserve bytes"
        );
    }

    #[test]
    fn test_windows1250_description() {
        let description = Encoding::Windows1250.description();
        assert_eq!(description, "Windows-1250 / CP1250 – Central European");
    }

    #[test]
    fn test_detect_windows1250_definitive_bytes() {
        // Contains 0x9D, which is ť in Windows-1250.
        let with_t_caron = [0x6D, 0x9D, 0x73, 0x74, 0x6F];
        assert_eq!(
            detect_encoding(&with_t_caron),
            Encoding::Windows1250,
            "Byte 0x9D (ť) should trigger Windows-1250 detection"
        );

        // Starts with 0x8F, which is Ź in Windows-1250.
        let with_z_acute_upper = [0x8F, 0x72, 0xF3, 0x64, 0xB3, 0x6F];
        assert_eq!(
            detect_encoding(&with_z_acute_upper),
            Encoding::Windows1250,
            "Byte 0x8F (Ź) should trigger Windows-1250 detection"
        );
    }

    #[test]
    fn test_detect_windows1250_strong_indicators() {
        // Two words, starting with 0x9C (ś) and 0x8C (Ś) respectively.
        let polish_text = [
            0x9C, 0x77, 0x69, 0x65, 0x74, 0x79, 0x20, 0x8C, 0x77, 0x69, 0x61, 0x74,
        ];
        assert_eq!(
            detect_encoding(&polish_text),
            Encoding::Windows1250,
            "Multiple Polish characters (ś, Ś) should trigger Windows-1250"
        );
    }

    #[test]
    fn test_detect_ambiguous_bytes_as_windows1252() {
        // Only bytes >= 0xA0, which are printable in both code pages.
        let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
        assert_eq!(
            detect_encoding(&zolc),
            Encoding::Windows1252,
            "Ambiguous bytes should default to Windows-1252"
        );

        let ambiguous = [0x6D, 0xB9, 0x6B, 0x61, 0x20, 0x6D, 0xB3, 0x6F, 0x64, 0x79];
        assert_eq!(
            detect_encoding(&ambiguous),
            Encoding::Windows1252,
            "Ambiguous Polish bytes should default to Windows-1252"
        );
    }

    #[test]
    fn test_detect_windows1250_czech_pangram() {
        let czech_pangram: &[u8] = &[
            0x50, 0xF8, 0xED, 0x6C, 0x69, 0x9A, 0x20, // first word
            0x9E, 0x6C, 0x75, 0x9D, 0x6F, 0x75, 0xE8, 0x6B, 0xFD, 0x20, // contains 0x9D
            0x6B, 0xF9, 0xF2, 0x20, //
            0xFA, 0x70, 0xEC, 0x6C, 0x20, //
            0xEF, 0xE1, 0x62, 0x65, 0x6C, 0x73, 0x6B, 0xE9, 0x20, //
            0xF3, 0x64, 0x79,
        ];
        assert_eq!(
            detect_encoding(czech_pangram),
            Encoding::Windows1250,
            "Czech pangram should be detected as Windows-1250 (contains ť = 0x9D)"
        );
    }

    #[test]
    fn test_detect_windows1252_not_1250() {
        // "Café résumé" with é as 0xE9.
        let windows1252_text = [
            0x43, 0x61, 0x66, 0xE9, 0x20, 0x72, 0xE9, 0x73, 0x75, 0x6D, 0xE9,
        ];
        assert_eq!(
            detect_encoding(&windows1252_text),
            Encoding::Windows1252,
            "French text should remain Windows-1252"
        );
    }

    #[test]
    fn test_detect_utf8_chinese_truncated_sequence() {
        // Two complete 3-byte characters, then only the lead byte of a third.
        let utf8_chinese_truncated = [0xE6, 0x9B, 0xB4, 0xE5, 0xA4, 0x9A, 0xE5];
        assert_eq!(
            detect_encoding(&utf8_chinese_truncated),
            Encoding::Utf8,
            "Truncated UTF-8 Chinese text should be detected as UTF-8"
        );

        // Same, but the third character is cut after two of its three bytes.
        let utf8_chinese_truncated_2 = [0xE6, 0x9B, 0xB4, 0xE5, 0xA4, 0x9A, 0xE5, 0xA4];
        assert_eq!(
            detect_encoding(&utf8_chinese_truncated_2),
            Encoding::Utf8,
            "Truncated UTF-8 with 2-byte incomplete sequence should be detected as UTF-8"
        );
    }

    #[test]
    fn test_detect_utf8_chinese_with_high_bytes() {
        let chinese_text = "更多全本全集精校小说";
        let bytes = chinese_text.as_bytes();
        assert_eq!(
            detect_encoding(bytes),
            Encoding::Utf8,
            "UTF-8 Chinese text should be detected as UTF-8, not Windows-1250"
        );

        // Guard the fixture itself: it must contain bytes in 0x80..0xA0.
        let has_high_continuation_bytes = bytes.iter().any(|&b| (0x80..0xA0).contains(&b));
        assert!(
            has_high_continuation_bytes,
            "Test should include bytes that could be mistaken for Windows-1250 indicators"
        );
    }

    #[test]
    fn test_detect_utf8_sample_truncation_at_boundary() {
        let chinese = "我的美女老师";
        let buffer: Vec<u8> = chinese.as_bytes().repeat(100);

        assert!(std::str::from_utf8(&buffer).is_ok());
        assert_eq!(detect_encoding(&buffer), Encoding::Utf8);

        // Cutting 1..=3 bytes off the end covers every way of splitting the
        // final 3-byte character (and removing it entirely).
        for truncate_offset in 1..=3 {
            let truncated = &buffer[..buffer.len() - truncate_offset];
            let is_strict_valid = std::str::from_utf8(truncated).is_ok();
            let detected = detect_encoding(truncated);
            assert_eq!(
                detected,
                Encoding::Utf8,
                "Truncated UTF-8 at offset -{} should be detected as UTF-8, strict_valid={}",
                truncate_offset,
                is_strict_valid
            );
        }
    }
}