use crate::{DecodeHintType, DecodeHintValue, DecodingHintDictionary};
use super::CharacterSet;
/// Field-less type used purely as a namespace: the charset-guessing helpers
/// are associated functions called as `StringUtils::...`, never on an instance.
pub struct StringUtils {}
/// When `true`, `StringUtils::guessCharset` accepts Shift_JIS for any input
/// that survives its Shift_JIS validity scan, without also requiring a run of
/// three consecutive katakana or double-byte characters. Hard-coded to `false`.
const ASSUME_SHIFT_JIS: bool = false;
/// Publicly exported static holding `CharacterSet::Shift_JIS`.
pub static SHIFT_JIS_CHARSET: CharacterSet = CharacterSet::Shift_JIS;
impl StringUtils {
    /// Guesses the character set of `bytes` and reports it as a short name.
    ///
    /// The charset itself comes from [`StringUtils::guessCharset`]; this method
    /// only translates it: Shift_JIS becomes `"SJIS"`, UTF-8 becomes `"UTF8"`,
    /// ISO-8859-1 becomes `"ISO8859_1"`, and every other charset is reported
    /// through `get_charset_name`.
    ///
    /// Returns `None` exactly when `guessCharset` returns `None`.
    pub fn guessEncoding(bytes: &[u8], hints: &DecodingHintDictionary) -> Option<&'static str> {
        let charset = StringUtils::guessCharset(bytes, hints)?;

        if charset == CharacterSet::Shift_JIS {
            return Some("SJIS");
        }
        if charset == CharacterSet::UTF8 {
            return Some("UTF8");
        }
        if charset == CharacterSet::ISO8859_1 {
            return Some("ISO8859_1");
        }
        Some(charset.get_charset_name())
    }

    /// Guesses which character set `bytes` is encoded in.
    ///
    /// The checks run in this order:
    ///
    /// 1. If `hints` has a `CHARACTER_SET` entry holding a
    ///    `DecodeHintValue::CharacterSet` name, the result of looking that name
    ///    up is returned as-is. This is the only path that can produce `None`.
    /// 2. If there are more than two bytes and they start with `FE FF` or
    ///    `FF FE`, the answer is UTF-16BE or UTF-16LE respectively.
    /// 3. Otherwise one pass over the bytes tracks whether the data can still
    ///    be UTF-8, ISO-8859-1 or Shift_JIS, and the result is chosen as follows:
    ///    - still plausible UTF-8, and either the input is longer than three
    ///      bytes and starts with `EF BB BF`, or at least one multi-byte lead
    ///      byte was seen: UTF-8;
    ///    - still plausible Shift_JIS, and either `ASSUME_SHIFT_JIS` is set or a
    ///      run of three or more consecutive katakana bytes or double-byte
    ///      characters was seen: Shift_JIS;
    ///    - both ISO-8859-1 and Shift_JIS plausible: Shift_JIS when the only
    ///      katakana bytes are one adjacent pair, or when at least 10% of the
    ///      bytes are in `0xA0..=0xBF`, `0xD7` or `0xF7`; otherwise ISO-8859-1;
    ///    - otherwise ISO-8859-1 if plausible, then Shift_JIS if plausible, and
    ///      UTF-8 as the last resort.
    ///
    /// NOTE(review): for empty input (and no hint) the 10% test is `0 >= 0`, so
    /// the result is Shift_JIS. Confirm that this is intended.
    pub fn guessCharset(bytes: &[u8], hints: &DecodingHintDictionary) -> Option<CharacterSet> {
        // A caller-supplied charset name overrides every heuristic below.
        if let Some(DecodeHintValue::CharacterSet(hinted_name)) =
            hints.get(&DecodeHintType::CHARACTER_SET)
        {
            return CharacterSet::get_character_set_by_name(hinted_name);
        }

        // UTF-16 byte-order marks; the trailing `_` keeps the "more than two
        // bytes" requirement.
        match bytes {
            [0xFE, 0xFF, _, ..] => return Some(CharacterSet::UTF16BE),
            [0xFF, 0xFE, _, ..] => return Some(CharacterSet::UTF16LE),
            _ => {}
        }

        // UTF-8 BOM, recognised only when at least one byte follows it.
        let has_utf8_bom = matches!(bytes, [0xEF, 0xBB, 0xBF, _, ..]);

        // Which candidate encodings have not been ruled out yet.
        let mut maybe_latin1 = true;
        let mut maybe_sjis = true;
        let mut maybe_utf8 = true;

        // UTF-8 scan state.
        let mut utf8_pending = 0;
        let mut utf8_two_byte_leads = 0;
        let mut utf8_three_byte_leads = 0;
        let mut utf8_four_byte_leads = 0;

        // Shift_JIS scan state.
        let mut sjis_pending = 0;
        let mut katakana_total = 0;
        let mut katakana_run = 0;
        let mut double_byte_run = 0;
        let mut longest_katakana_run = 0;
        let mut longest_double_byte_run = 0;

        // ISO-8859-1 scan state: bytes in 0xA0..=0xBF, 0xD7 or 0xF7.
        let mut latin1_high_other = 0;

        for &byte in bytes {
            // Nothing left to rule out, so the remaining bytes cannot matter.
            if !maybe_latin1 && !maybe_sjis && !maybe_utf8 {
                break;
            }

            if maybe_utf8 {
                let high_bit_set = (byte & 0x80) != 0;
                if utf8_pending > 0 {
                    // NOTE(review): only the high bit is tested, so bytes in
                    // 0xC0..=0xFF are accepted as continuation bytes too.
                    if high_bit_set {
                        utf8_pending -= 1;
                    } else {
                        maybe_utf8 = false;
                    }
                } else if high_bit_set {
                    if (byte & 0x40) == 0 {
                        // 10xxxxxx where a lead byte was expected.
                        maybe_utf8 = false;
                    } else if (byte & 0x20) == 0 {
                        // 110xxxxx: one continuation byte follows.
                        utf8_pending += 1;
                        utf8_two_byte_leads += 1;
                    } else if (byte & 0x10) == 0 {
                        // 1110xxxx: two continuation bytes follow.
                        utf8_pending += 2;
                        utf8_three_byte_leads += 1;
                    } else {
                        utf8_pending += 3;
                        if (byte & 0x08) == 0 {
                            // 11110xxx: three continuation bytes follow.
                            utf8_four_byte_leads += 1;
                        } else {
                            // 11111xxx is rejected as a lead byte.
                            maybe_utf8 = false;
                        }
                    }
                }
            }

            if maybe_latin1 {
                match byte {
                    0x80..=0x9F => maybe_latin1 = false,
                    0xA0..=0xBF | 0xD7 | 0xF7 => latin1_high_other += 1,
                    _ => {}
                }
            }

            if maybe_sjis {
                if sjis_pending > 0 {
                    // Second byte of a double-byte character.
                    if matches!(byte, 0x40..=0x7E | 0x80..=0xFC) {
                        sjis_pending -= 1;
                    } else {
                        maybe_sjis = false;
                    }
                } else {
                    match byte {
                        0x80 | 0xA0 | 0xF0..=0xFF => maybe_sjis = false,
                        0xA1..=0xDF => {
                            // Single byte counted as katakana.
                            katakana_total += 1;
                            double_byte_run = 0;
                            katakana_run += 1;
                            if katakana_run > longest_katakana_run {
                                longest_katakana_run = katakana_run;
                            }
                        }
                        0x81..=0x9F | 0xE0..=0xEF => {
                            // First byte of a double-byte character.
                            sjis_pending += 1;
                            katakana_run = 0;
                            double_byte_run += 1;
                            if double_byte_run > longest_double_byte_run {
                                longest_double_byte_run = double_byte_run;
                            }
                        }
                        0x00..=0x7F => {
                            // A low byte ends any run in progress.
                            katakana_run = 0;
                            double_byte_run = 0;
                        }
                    }
                }
            }
        }

        // Input that stops in the middle of a multi-byte character is invalid.
        if utf8_pending > 0 {
            maybe_utf8 = false;
        }
        if sjis_pending > 0 {
            maybe_sjis = false;
        }

        if maybe_utf8
            && (has_utf8_bom
                || utf8_two_byte_leads + utf8_three_byte_leads + utf8_four_byte_leads > 0)
        {
            return Some(CharacterSet::UTF8);
        }

        let has_long_sjis_run = longest_katakana_run >= 3 || longest_double_byte_run >= 3;
        if maybe_sjis && (ASSUME_SHIFT_JIS || has_long_sjis_run) {
            return Some(CharacterSet::Shift_JIS);
        }

        if maybe_latin1 && maybe_sjis {
            // Tie-break for short text that is valid in both encodings.
            let prefers_sjis = (longest_katakana_run == 2 && katakana_total == 2)
                || latin1_high_other * 10 >= bytes.len();
            return Some(if prefers_sjis {
                CharacterSet::Shift_JIS
            } else {
                CharacterSet::ISO8859_1
            });
        }

        if maybe_latin1 {
            Some(CharacterSet::ISO8859_1)
        } else if maybe_sjis {
            Some(CharacterSet::Shift_JIS)
        } else {
            // Reached whether or not UTF-8 is still plausible.
            Some(CharacterSet::UTF8)
        }
    }
}