use crate::DecodeHints;
use super::CharacterSet;
/// When `true`, `guessCharset` reports Shift_JIS for any input that passes its
/// Shift_JIS validity scan, without also requiring a run of three or more
/// katakana or double-byte characters.
const ASSUME_SHIFT_JIS: bool = false;
/// Public alias for `CharacterSet::Shift_JIS`.
pub const SHIFT_JIS_CHARSET: CharacterSet = CharacterSet::Shift_JIS;
/// Returns a short name for the character set that `guessCharset` picks for
/// `bytes` under the given `hints`.
///
/// Shift_JIS, UTF-8 and ISO-8859-1 are reported as `"SJIS"`, `"UTF8"` and
/// `"ISO8859_1"` respectively; every other character set is reported through
/// its own `get_charset_name()`. Yields `None` whenever `guessCharset` does.
pub fn guessEncoding(bytes: &[u8], hints: &DecodeHints) -> Option<&'static str> {
    let charset = guessCharset(bytes, hints)?;

    // The three most common guesses use fixed short names; anything else
    // falls back to the character set's own name.
    let name = if charset == CharacterSet::Shift_JIS {
        "SJIS"
    } else if charset == CharacterSet::UTF8 {
        "UTF8"
    } else if charset == CharacterSet::ISO8859_1 {
        "ISO8859_1"
    } else {
        charset.get_charset_name()
    };

    Some(name)
}
/// Guesses the character set used to encode `bytes`.
///
/// Resolution order:
/// 1. If `hints.CharacterSet` is set, the result of
///    `CharacterSet::get_character_set_by_name` for that name is returned
///    as-is (this is the only path that can produce `None`).
/// 2. Inputs longer than two bytes that begin with a UTF-16 byte-order mark
///    are reported as UTF-16BE (`FE FF`) or UTF-16LE (`FF FE`).
/// 3. Otherwise the bytes are scanned once, tracking in parallel whether they
///    could be valid UTF-8, ISO-8859-1 or Shift_JIS, and a handful of
///    heuristics pick between the candidates that remain.
///
/// If no candidate survives the scan, UTF-8 is returned as the fallback.
pub fn guessCharset(bytes: &[u8], hints: &DecodeHints) -> Option<CharacterSet> {
    // An explicit hint always wins over any guessing.
    if let Some(cs_name) = &hints.CharacterSet {
        return CharacterSet::get_character_set_by_name(cs_name);
    }

    // First try UTF-16, assuming anything with its BOM is UTF-16.
    if bytes.len() > 2 {
        match bytes[0..2] {
            [0xFE, 0xFF] => return Some(CharacterSet::UTF16BE),
            [0xFF, 0xFE] => return Some(CharacterSet::UTF16LE),
            _ => {}
        }
    }

    let length = bytes.len();

    // Candidate encodings still consistent with everything seen so far.
    let mut can_be_iso88591 = true;
    let mut can_be_shift_jis = true;
    let mut can_be_utf8 = true;

    // UTF-8 state: continuation bytes still expected, and how many 2-, 3- and
    // 4-byte sequences have been started.
    let mut utf8_bytes_left = 0;
    let mut utf2_bytes_chars = 0;
    let mut utf3_bytes_chars = 0;
    let mut utf4_bytes_chars = 0;

    // Shift_JIS state: trail bytes still expected, plus current/longest runs
    // of single-byte katakana and of double-byte characters.
    let mut sjis_bytes_left = 0;
    let mut sjis_katakana_chars = 0;
    let mut sjis_cur_katakana_word_length = 0;
    let mut sjis_cur_double_bytes_word_length = 0;
    let mut sjis_max_katakana_word_length = 0;
    let mut sjis_max_double_bytes_word_length = 0;

    // ISO-8859-1 state: count of high bytes that are symbols rather than
    // accented letters (0xA0..=0xBF, 0xD7 and 0xF7).
    let mut iso_high_other = 0;

    let utf8bom = bytes.len() > 3 && bytes[0..=2] == [0xEF, 0xBB, 0xBF];

    for &byte in bytes {
        // Nothing left to rule out: stop scanning early.
        if !(can_be_iso88591 || can_be_shift_jis || can_be_utf8) {
            break;
        }

        // UTF-8
        if can_be_utf8 {
            if utf8_bytes_left > 0 {
                // Inside a multi-byte sequence every byte must have the high
                // bit set.
                if (byte & 0x80) == 0 {
                    can_be_utf8 = false;
                } else {
                    utf8_bytes_left -= 1;
                }
            } else if (byte & 0x80) != 0 {
                if (byte & 0x40) == 0 {
                    // 10xxxxxx: a continuation byte with no lead byte.
                    can_be_utf8 = false;
                } else {
                    // The number of leading 1 bits gives the sequence length.
                    utf8_bytes_left += 1;
                    if (byte & 0x20) == 0 {
                        utf2_bytes_chars += 1;
                    } else {
                        utf8_bytes_left += 1;
                        if (byte & 0x10) == 0 {
                            utf3_bytes_chars += 1;
                        } else {
                            utf8_bytes_left += 1;
                            if (byte & 0x08) == 0 {
                                utf4_bytes_chars += 1;
                            } else {
                                // 11111xxx is never a valid lead byte.
                                can_be_utf8 = false;
                            }
                        }
                    }
                }
            }
        }

        // ISO-8859-1
        if can_be_iso88591 {
            if matches!(byte, 0x80..=0x9F) {
                // 0x80..=0x9F are C1 control codes, not printable Latin-1.
                can_be_iso88591 = false;
            } else if matches!(byte, 0xA0..=0xBF | 0xD7 | 0xF7) {
                iso_high_other += 1;
            }
        }

        // Shift_JIS
        if can_be_shift_jis {
            if sjis_bytes_left > 0 {
                // Valid trail bytes are 0x40..=0x7E and 0x80..=0xFC.
                if matches!(byte, 0x00..=0x3F | 0x7F | 0xFD..=0xFF) {
                    can_be_shift_jis = false;
                } else {
                    sjis_bytes_left -= 1;
                }
            } else if matches!(byte, 0x80 | 0xA0 | 0xF0..=0xFF) {
                // These values are neither single-byte characters nor valid
                // lead bytes.
                can_be_shift_jis = false;
            } else if matches!(byte, 0xA1..=0xDF) {
                // Single-byte half-width katakana.
                sjis_katakana_chars += 1;
                sjis_cur_double_bytes_word_length = 0;
                sjis_cur_katakana_word_length += 1;
                if sjis_cur_katakana_word_length > sjis_max_katakana_word_length {
                    sjis_max_katakana_word_length = sjis_cur_katakana_word_length;
                }
            } else if byte > 0x7F {
                // Remaining high bytes (0x81..=0x9F, 0xE0..=0xEF) start a
                // double-byte character.
                sjis_bytes_left += 1;
                sjis_cur_katakana_word_length = 0;
                sjis_cur_double_bytes_word_length += 1;
                if sjis_cur_double_bytes_word_length > sjis_max_double_bytes_word_length {
                    sjis_max_double_bytes_word_length = sjis_cur_double_bytes_word_length;
                }
            } else {
                // Plain ASCII ends any current run.
                sjis_cur_katakana_word_length = 0;
                sjis_cur_double_bytes_word_length = 0;
            }
        }
    }

    // A multi-byte sequence cut off by the end of input invalidates the
    // corresponding encoding.
    if can_be_utf8 && utf8_bytes_left > 0 {
        can_be_utf8 = false;
    }
    if can_be_shift_jis && sjis_bytes_left > 0 {
        can_be_shift_jis = false;
    }

    // Easy: valid UTF-8 with a BOM or at least one multi-byte character.
    if can_be_utf8 && (utf8bom || utf2_bytes_chars + utf3_bytes_chars + utf4_bytes_chars > 0) {
        return Some(CharacterSet::UTF8);
    }

    // Easy: valid Shift_JIS with a run of three or more katakana or
    // double-byte characters (or Shift_JIS is assumed outright).
    if can_be_shift_jis
        && (ASSUME_SHIFT_JIS
            || sjis_max_katakana_word_length >= 3
            || sjis_max_double_bytes_word_length >= 3)
    {
        return Some(CharacterSet::Shift_JIS);
    }

    // Both are possible: prefer Shift_JIS when the input has exactly one
    // two-character katakana run, or when at least 10% of the bytes are
    // Latin-1 symbols rather than letters; otherwise prefer ISO-8859-1.
    if can_be_iso88591 && can_be_shift_jis {
        return if (sjis_max_katakana_word_length == 2 && sjis_katakana_chars == 2)
            || iso_high_other * 10 >= length
        {
            Some(CharacterSet::Shift_JIS)
        } else {
            Some(CharacterSet::ISO8859_1)
        };
    }

    // Otherwise take whichever single candidate is still standing.
    if can_be_iso88591 {
        return Some(CharacterSet::ISO8859_1);
    }
    if can_be_shift_jis {
        return Some(CharacterSet::Shift_JIS);
    }

    // Either plain-ASCII-compatible UTF-8 is all that remains or nothing
    // matched at all; UTF-8 is the answer in both cases.
    Some(CharacterSet::UTF8)
}