use std::collections::HashMap;
use std::result::Result as StdResult;
use std::str;
use crate::traits::{DictError, Result};
/// Text encodings known to the detection and conversion routines in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TextEncoding {
    /// UTF-8.
    Utf8,
    /// UTF-16, little-endian byte order.
    Utf16Le,
    /// UTF-16, big-endian byte order.
    Utf16Be,
    /// Windows-1252 (single byte).
    Windows1252,
    /// ISO-8859-1 (single byte).
    Iso88591,
    /// GB2312 (double byte).
    Gb2312,
    /// Big5 (double byte).
    Big5,
    /// Shift-JIS (double byte).
    ShiftJis,
    /// EUC-KR (double byte).
    EucKr,
    /// The encoding could not be determined.
    Unknown,
}

impl TextEncoding {
    /// Returns the human-readable label of the encoding (for example `"UTF-8"`).
    pub fn name(&self) -> &'static str {
        match self {
            TextEncoding::Utf8 => "UTF-8",
            TextEncoding::Utf16Le => "UTF-16LE",
            TextEncoding::Utf16Be => "UTF-16BE",
            TextEncoding::Windows1252 => "Windows-1252",
            TextEncoding::Iso88591 => "ISO-8859-1",
            TextEncoding::Gb2312 => "GB2312",
            TextEncoding::Big5 => "Big5",
            TextEncoding::ShiftJis => "Shift-JIS",
            TextEncoding::EucKr => "EUC-KR",
            TextEncoding::Unknown => "Unknown",
        }
    }

    /// Returns `true` for the Unicode transformation formats (UTF-8 and both
    /// UTF-16 byte orders).
    pub fn is_unicode(&self) -> bool {
        matches!(
            self,
            TextEncoding::Utf8 | TextEncoding::Utf16Le | TextEncoding::Utf16Be
        )
    }

    /// Returns `true` when different characters may occupy a different number
    /// of bytes.
    ///
    /// UTF-16 is included: characters outside the Basic Multilingual Plane are
    /// stored as a surrogate pair (two 16-bit units), so it is not fixed-width.
    pub fn is_variable_width(&self) -> bool {
        matches!(
            self,
            TextEncoding::Utf8
                | TextEncoding::Utf16Le
                | TextEncoding::Utf16Be
                | TextEncoding::Gb2312
                | TextEncoding::Big5
                | TextEncoding::ShiftJis
                | TextEncoding::EucKr
        )
    }

    /// Returns the largest number of bytes a single character can occupy.
    ///
    /// `Unknown` reports `1`.
    pub fn max_char_bytes(&self) -> usize {
        match self {
            // A UTF-16 surrogate pair is two 16-bit code units, i.e. 4 bytes,
            // the same upper bound as UTF-8.
            TextEncoding::Utf8 | TextEncoding::Utf16Le | TextEncoding::Utf16Be => 4,
            TextEncoding::Windows1252 | TextEncoding::Iso88591 => 1,
            TextEncoding::Gb2312
            | TextEncoding::Big5
            | TextEncoding::ShiftJis
            | TextEncoding::EucKr => 2,
            TextEncoding::Unknown => 1,
        }
    }
}
/// Guesses the text encoding of `data`.
///
/// Checks are applied in this order:
/// 1. empty input yields [`TextEncoding::Unknown`];
/// 2. a byte-order mark decides the encoding;
/// 3. input that validates as UTF-8 is reported as UTF-8;
/// 4. otherwise every legacy encoding is scored and the highest score wins.
///
/// When no candidate reaches a score of `0.1`, the result falls back to
/// Windows-1252 (or UTF-8 for pure ASCII input).
///
/// # Errors
///
/// Every path in the current implementation returns `Ok`; the `Result` is
/// kept for interface stability.
pub fn detect_encoding(data: &[u8]) -> Result<TextEncoding> {
    if data.is_empty() {
        return Ok(TextEncoding::Unknown);
    }
    if let Some(encoding) = detect_bom(data) {
        return Ok(encoding);
    }
    if is_valid_utf8(data) {
        return Ok(TextEncoding::Utf8);
    }

    // A fixed-order list keeps the result deterministic: several scorers
    // accept overlapping byte ranges and frequently tie, and iterating a
    // `HashMap` would break such ties in random hash order. With the strict
    // `>` below, the earliest candidate in this list wins a tie.
    let candidates = [
        (TextEncoding::Windows1252, score_windows1252(data)),
        (TextEncoding::Gb2312, score_gb2312(data)),
        (TextEncoding::Big5, score_big5(data)),
        (TextEncoding::ShiftJis, score_shift_jis(data)),
        (TextEncoding::EucKr, score_euc_kr(data)),
    ];

    let mut best_encoding = TextEncoding::Unknown;
    let mut best_score = -1.0f32;
    for (encoding, score) in candidates {
        if score > best_score {
            best_encoding = encoding;
            best_score = score;
        }
    }

    if best_score >= 0.1 {
        return Ok(best_encoding);
    }

    // No candidate is convincing. Pure ASCII has normally been reported as
    // UTF-8 by the validation step above; the check is kept as a safety net.
    if is_ascii_only(data) {
        Ok(TextEncoding::Utf8)
    } else {
        Ok(TextEncoding::Windows1252)
    }
}
/// Looks for a byte-order mark at the start of `data`.
///
/// Recognises `EF BB BF` (UTF-8), `FF FE` (UTF-16LE) and `FE FF` (UTF-16BE);
/// returns `None` when the input starts with none of them.
fn detect_bom(data: &[u8]) -> Option<TextEncoding> {
    match data {
        [0xEF, 0xBB, 0xBF, ..] => Some(TextEncoding::Utf8),
        [0xFF, 0xFE, ..] => Some(TextEncoding::Utf16Le),
        [0xFE, 0xFF, ..] => Some(TextEncoding::Utf16Be),
        _ => None,
    }
}
/// Returns `true` when `data` is well-formed UTF-8.
///
/// Delegates to the standard library validator, which also rejects overlong
/// encodings, encoded surrogates (U+D800..=U+DFFF) and code points above
/// U+10FFFF. A check that only looks at lead/continuation bit patterns lets
/// those through, and the bytes would then fail in `str::from_utf8` later.
fn is_valid_utf8(data: &[u8]) -> bool {
    std::str::from_utf8(data).is_ok()
}
/// Returns `true` when every byte of `data` is below `0x80` (also true for
/// empty input).
fn is_ascii_only(data: &[u8]) -> bool {
    data.is_ascii()
}
/// Heuristic plausibility score for `data` being Windows-1252 text.
///
/// Per-byte weights: control bytes below `0x20` add `0.1`, `0x20..=0x7E`
/// adds `1.0`, `0xA0..=0xFE` adds `0.8`, and every other byte (`0x7F`,
/// `0x80..=0x9F`, `0xFF`) subtracts `0.5`. The sum is divided by the input
/// length. Input without any byte in `0x20..=0x7E` scores `0.0`.
fn score_windows1252(data: &[u8]) -> f32 {
    let mut total = 0.0_f32;
    let mut saw_printable_ascii = false;

    for &b in data {
        total += match b {
            0x00..=0x1F => 0.1,
            0x20..=0x7E => {
                saw_printable_ascii = true;
                1.0
            }
            0xA0..=0xFE => 0.8,
            _ => -0.5,
        };
    }

    if !saw_printable_ascii {
        return 0.0;
    }
    total / (data.len() as f32)
}
/// Heuristic plausibility score for `data` being GB2312 text.
///
/// Bytes below `0x80` are single-byte: `0x20..=0x7E` adds `1.0`, the rest add
/// nothing. Any other byte is read together with its successor as a pair: a
/// lead in `0xA1..=0xF7` with a trail in `0xA1..=0xFE` adds `2.0`, any other
/// pair subtracts `1.0`. A high byte at the very end (no trail available)
/// subtracts `0.5`. The sum is divided by half the input length; empty input
/// scores `0.0`.
fn score_gb2312(data: &[u8]) -> f32 {
    if data.is_empty() {
        return 0.0;
    }

    let mut total = 0.0_f32;
    let mut rest = data;
    while let Some((&lead, tail)) = rest.split_first() {
        rest = match (lead, tail.split_first()) {
            (0x00..=0x7F, _) => {
                if matches!(lead, 0x20..=0x7E) {
                    total += 1.0;
                }
                tail
            }
            (_, Some((&trail, after))) => {
                let plausible = matches!(lead, 0xA1..=0xF7) && matches!(trail, 0xA1..=0xFE);
                total += if plausible { 2.0 } else { -1.0 };
                after
            }
            (_, None) => {
                total -= 0.5;
                tail
            }
        };
    }

    total / (data.len() as f32 / 2.0)
}
/// Heuristic plausibility score for `data` being Big5 text.
///
/// Bytes below `0x80` are single-byte: `0x20..=0x7E` adds `1.0`, the rest add
/// nothing. Any other byte is read together with its successor as a pair. A
/// pair adds `2.0` when the lead is in `0xA1..=0xFE` and the trail is in
/// `0x40..=0x7E` or `0xA1..=0xFE`, or when the lead is `0x87` and the trail
/// is in `0xA1..=0xFE`; any other pair subtracts `1.0`. A high byte at the
/// very end subtracts `0.5`. The sum is divided by half the input length;
/// empty input scores `0.0`.
fn score_big5(data: &[u8]) -> f32 {
    if data.is_empty() {
        return 0.0;
    }

    let mut total = 0.0_f32;
    let mut rest = data;
    while let Some((&lead, tail)) = rest.split_first() {
        rest = match (lead, tail.split_first()) {
            (0x00..=0x7F, _) => {
                if matches!(lead, 0x20..=0x7E) {
                    total += 1.0;
                }
                tail
            }
            (_, Some((&trail, after))) => {
                let plausible = match lead {
                    0xA1..=0xFE => matches!(trail, 0x40..=0x7E | 0xA1..=0xFE),
                    0x87 => matches!(trail, 0xA1..=0xFE),
                    _ => false,
                };
                total += if plausible { 2.0 } else { -1.0 };
                after
            }
            (_, None) => {
                total -= 0.5;
                tail
            }
        };
    }

    total / (data.len() as f32 / 2.0)
}
/// Heuristic plausibility score for `data` being Shift-JIS text.
///
/// Bytes below `0x80` are single-byte: `0x20..=0x7E` adds `1.0`, the rest add
/// nothing. A lead byte in `0x81..=0x9F` or `0xE0..=0xEF` is read together
/// with its successor: a trail in `0x40..=0x7E` or `0x80..=0xFC` adds `2.0`,
/// any other trail subtracts `1.0`, and a missing trail subtracts `0.5`.
/// Every remaining high byte is treated as a single byte and adds `0.8`.
/// The sum is divided by half the input length; empty input scores `0.0`.
fn score_shift_jis(data: &[u8]) -> f32 {
    if data.is_empty() {
        return 0.0;
    }

    let mut total = 0.0_f32;
    let mut rest = data;
    while let Some((&lead, tail)) = rest.split_first() {
        rest = match lead {
            0x00..=0x7F => {
                if matches!(lead, 0x20..=0x7E) {
                    total += 1.0;
                }
                tail
            }
            0x81..=0x9F | 0xE0..=0xEF => match tail.split_first() {
                Some((&trail, after)) => {
                    let plausible = matches!(trail, 0x40..=0x7E | 0x80..=0xFC);
                    total += if plausible { 2.0 } else { -1.0 };
                    after
                }
                None => {
                    total -= 0.5;
                    tail
                }
            },
            _ => {
                total += 0.8;
                tail
            }
        };
    }

    total / (data.len() as f32 / 2.0)
}
/// Heuristic plausibility score for `data` being EUC-KR text.
///
/// Bytes below `0x80` are single-byte: `0x20..=0x7E` adds `1.0`, the rest add
/// nothing. Any other byte is read together with its successor as a pair:
/// both bytes in `0xA1..=0xFE` adds `2.0`, any other pair subtracts `1.0`.
/// A high byte at the very end subtracts `0.5`. The sum is divided by half
/// the input length; empty input scores `0.0`.
fn score_euc_kr(data: &[u8]) -> f32 {
    if data.is_empty() {
        return 0.0;
    }

    let mut total = 0.0_f32;
    let mut rest = data;
    while let Some((&lead, tail)) = rest.split_first() {
        rest = match (lead, tail.split_first()) {
            (0x00..=0x7F, _) => {
                if matches!(lead, 0x20..=0x7E) {
                    total += 1.0;
                }
                tail
            }
            (0xA1..=0xFE, Some((&(0xA1..=0xFE), after))) => {
                total += 2.0;
                after
            }
            (_, Some((_, after))) => {
                total -= 1.0;
                after
            }
            (_, None) => {
                total -= 0.5;
                tail
            }
        };
    }

    total / (data.len() as f32 / 2.0)
}
/// Decodes `data` from `from_encoding` into an owned UTF-8 `String`.
///
/// For [`TextEncoding::Unknown`] the bytes are returned as-is when they are
/// valid UTF-8 and decoded as Windows-1252 otherwise.
///
/// # Errors
///
/// Returns `DictError::Internal` when the bytes are not valid for the
/// requested encoding (invalid UTF-8, odd-length or ill-formed UTF-16, or a
/// legacy decoder that had to emit replacement characters).
pub fn convert_to_utf8(data: &[u8], from_encoding: TextEncoding) -> Result<String> {
    match from_encoding {
        TextEncoding::Utf8 => str::from_utf8(data)
            .map(String::from)
            .map_err(|e| DictError::Internal(format!("Invalid UTF-8: {}", e))),
        TextEncoding::Windows1252 => convert_windows1252_to_utf8(data),
        TextEncoding::Iso88591 => convert_iso88591_to_utf8(data),
        TextEncoding::Gb2312 => convert_gb2312_to_utf8(data),
        TextEncoding::Big5 => convert_big5_to_utf8(data),
        TextEncoding::ShiftJis => convert_shift_jis_to_utf8(data),
        TextEncoding::EucKr => convert_euc_kr_to_utf8(data),
        TextEncoding::Utf16Le => convert_utf16le_to_utf8(data),
        TextEncoding::Utf16Be => convert_utf16be_to_utf8(data),
        // Validate exactly once with the strict standard validator. A separate
        // pre-check followed by a second decode can disagree on ill-formed
        // input and turn the intended Windows-1252 fallback into an error.
        TextEncoding::Unknown => match str::from_utf8(data) {
            Ok(text) => Ok(String::from(text)),
            Err(_) => convert_windows1252_to_utf8(data),
        },
    }
}
/// Decodes `data` with the `encoding_rs` Windows-1252 decoder, without any
/// byte-order-mark handling.
///
/// # Errors
///
/// Returns `DictError::Internal` when the decoder reports errors.
fn convert_windows1252_to_utf8(data: &[u8]) -> Result<String> {
    match encoding_rs::WINDOWS_1252.decode_without_bom_handling(data) {
        (text, false) => Ok(text.into_owned()),
        (_, true) => Err(DictError::Internal(
            "Windows-1252 conversion produced replacement characters".to_string(),
        )),
    }
}
/// Decodes `data` labelled as ISO-8859-1, without any byte-order-mark handling.
///
/// The decoder used is `encoding_rs::WINDOWS_1252`, so bytes are mapped with
/// the Windows-1252 table.
///
/// # Errors
///
/// Returns `DictError::Internal` when the decoder reports errors.
fn convert_iso88591_to_utf8(data: &[u8]) -> Result<String> {
    match encoding_rs::WINDOWS_1252.decode_without_bom_handling(data) {
        (text, false) => Ok(text.into_owned()),
        (_, true) => Err(DictError::Internal(
            "ISO-8859-1 conversion produced replacement characters".to_string(),
        )),
    }
}
/// Decodes little-endian UTF-16 bytes into a `String`.
///
/// # Errors
///
/// Returns `DictError::Internal` when `data` has an odd length or the code
/// units are not well-formed UTF-16.
fn convert_utf16le_to_utf8(data: &[u8]) -> Result<String> {
    if data.len() % 2 == 1 {
        return Err(DictError::Internal(
            "Invalid UTF-16LE data length".to_string(),
        ));
    }

    // Reassemble each byte pair into one 16-bit code unit (low byte first).
    let units: Vec<u16> = data
        .chunks_exact(2)
        .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
        .collect();

    match String::from_utf16(&units) {
        Ok(text) => Ok(text),
        Err(e) => Err(DictError::Internal(format!("Invalid UTF-16LE data: {e}"))),
    }
}
/// Decodes big-endian UTF-16 bytes into a `String`.
///
/// # Errors
///
/// Returns `DictError::Internal` when `data` has an odd length or the code
/// units are not well-formed UTF-16.
fn convert_utf16be_to_utf8(data: &[u8]) -> Result<String> {
    if data.len() % 2 == 1 {
        return Err(DictError::Internal(
            "Invalid UTF-16BE data length".to_string(),
        ));
    }

    // Reassemble each byte pair into one 16-bit code unit (high byte first).
    let units: Vec<u16> = data
        .chunks_exact(2)
        .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
        .collect();

    match String::from_utf16(&units) {
        Ok(text) => Ok(text),
        Err(e) => Err(DictError::Internal(format!("Invalid UTF-16BE data: {e}"))),
    }
}
/// Decodes GB2312 input using the `encoding_rs` GBK decoder.
///
/// # Errors
///
/// Returns `DictError::Internal` when the decoder reports errors.
fn convert_gb2312_to_utf8(data: &[u8]) -> Result<String> {
    // The middle tuple element (the encoding actually used) is not needed.
    match encoding_rs::GBK.decode(data) {
        (text, _, false) => Ok(text.into_owned()),
        (_, _, true) => Err(DictError::Internal(
            "GB2312/GBK conversion produced replacement characters".to_string(),
        )),
    }
}
/// Decodes Big5 input using the `encoding_rs` Big5 decoder.
///
/// # Errors
///
/// Returns `DictError::Internal` when the decoder reports errors.
fn convert_big5_to_utf8(data: &[u8]) -> Result<String> {
    // The middle tuple element (the encoding actually used) is not needed.
    match encoding_rs::BIG5.decode(data) {
        (text, _, false) => Ok(text.into_owned()),
        (_, _, true) => Err(DictError::Internal(
            "Big5 conversion produced replacement characters".to_string(),
        )),
    }
}
/// Decodes Shift-JIS input using the `encoding_rs` Shift_JIS decoder.
///
/// # Errors
///
/// Returns `DictError::Internal` when the decoder reports errors.
fn convert_shift_jis_to_utf8(data: &[u8]) -> Result<String> {
    // The middle tuple element (the encoding actually used) is not needed.
    match encoding_rs::SHIFT_JIS.decode(data) {
        (text, _, false) => Ok(text.into_owned()),
        (_, _, true) => Err(DictError::Internal(
            "Shift-JIS conversion produced replacement characters".to_string(),
        )),
    }
}
/// Decodes EUC-KR input using the `encoding_rs` EUC-KR decoder.
///
/// # Errors
///
/// Returns `DictError::Internal` when the decoder reports errors.
fn convert_euc_kr_to_utf8(data: &[u8]) -> Result<String> {
    // The middle tuple element (the encoding actually used) is not needed.
    match encoding_rs::EUC_KR.decode(data) {
        (text, _, false) => Ok(text.into_owned()),
        (_, _, true) => Err(DictError::Internal(
            "EUC-KR conversion produced replacement characters".to_string(),
        )),
    }
}
/// Returns `true` when the bytes of `s` are well-formed UTF-8.
///
/// A `&str` is valid UTF-8 by construction, so this holds for every string
/// created through safe code. The check runs the standard validator over the
/// raw bytes instead of inspecting bytes one by one: a per-byte test that only
/// accepts ASCII and lead bytes rejects continuation bytes (`0x80..=0xBF`)
/// and therefore every string containing a non-ASCII character.
pub fn is_valid_utf8_str(s: &str) -> bool {
    std::str::from_utf8(s.as_bytes()).is_ok()
}
/// Builds a descriptive [`EncodingStats`] record for `encoding`.
///
/// The flags and the maximum character size come from the corresponding
/// `TextEncoding` methods, so the record cannot drift from them. The match is
/// exhaustive on purpose: a catch-all arm would label UTF-16 and `Unknown`
/// as non-Unicode "Asian languages" encodings.
pub fn get_encoding_stats(encoding: TextEncoding) -> EncodingStats {
    let common_in = match encoding {
        TextEncoding::Utf8 => vec!["International", "Web", "Modern files"],
        TextEncoding::Utf16Le | TextEncoding::Utf16Be => vec!["Windows", "International"],
        TextEncoding::Windows1252 => vec!["Windows", "Latin languages"],
        TextEncoding::Iso88591 => vec!["Unix", "Latin languages", "Old systems"],
        TextEncoding::Gb2312
        | TextEncoding::Big5
        | TextEncoding::ShiftJis
        | TextEncoding::EucKr => vec!["Asian languages"],
        // Nothing is known about an undetermined encoding.
        TextEncoding::Unknown => Vec::new(),
    };

    EncodingStats {
        name: encoding.name(),
        supports_unicode: encoding.is_unicode(),
        max_char_size: encoding.max_char_bytes(),
        is_variable_width: encoding.is_variable_width(),
        common_in,
    }
}
/// Descriptive properties of a text encoding, as produced by
/// `get_encoding_stats`.
#[derive(Debug, Clone)]
pub struct EncodingStats {
/// Human-readable label of the encoding.
pub name: &'static str,
/// Whether the encoding is flagged as a Unicode encoding.
pub supports_unicode: bool,
/// Largest size reported for a single character (presumably in bytes; it
/// mirrors `TextEncoding::max_char_bytes` — confirm against callers).
pub max_char_size: usize,
/// Whether characters may differ in encoded size.
pub is_variable_width: bool,
/// Free-form labels naming where the encoding is commonly found.
pub common_in: Vec<&'static str>,
}