pub use chardet::*;
pub use encoding_rs::*;
/// Encodings that [`auto_decode`] tries first, in this order: UTF-8, GBK, GB18030.
// `'static` is implied for references in a `static` item's type, so it is not spelled out.
pub static DEFAULT_ENCODE_LABEL_LIST: [&Encoding; 3] = [UTF_8, GBK, GB18030];
/// Decodes `content` with the first encoding in `decode_labels` that yields a clean result.
///
/// Candidates are tried in slice order. A candidate is rejected when the decoder reports
/// errors or when the decoded text contains U+FFFD (the replacement character); note that
/// this also rejects input in which U+FFFD is genuinely present.
///
/// Returns the decoded text as an owned `String`, or `None` if every candidate is rejected
/// (including when `decode_labels` is empty).
pub fn decode<'a>(content: &[u8], decode_labels: &'a [&'static Encoding]) -> Option<String> {
    decode_labels.iter().find_map(|candidate| {
        // The middle tuple element (the encoding actually used) is not needed here.
        let (text, _, malformed) = candidate.decode(content);
        if malformed || text.contains('\u{FFFD}') {
            None
        } else {
            Some(text.into_owned())
        }
    })
}
pub fn auto_decode(content: &[u8]) -> Option<String> {
let mut labels: Vec<&'static Encoding> = DEFAULT_ENCODE_LABEL_LIST.to_vec();
let detected = detect(content);
let detected_charset = detected.0.as_str().to_lowercase();
if let Some(encoding) = encoding_rs::Encoding::for_label(detected_charset.as_bytes()) {
labels.push(encoding);
}
decode(content, &labels)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Well-formed UTF-8 input must be decoded as-is.
    ///
    /// The result is compared as an `Option` on purpose: falling back to a lossy UTF-8
    /// conversion would produce the expected text even if `auto_decode` returned `None`,
    /// hiding a failure.
    #[test]
    fn test_auto_decode() {
        let stdout = b"out -> \xe4\xbd\xa0\xe5\xa5\xbd: \xe6\x88\x91\xe4\xb8\x8d\xe8\x83\xbd\xe8\xae\xa4\xe8\xaf\x86\xe8\xaf\xad\xe8\xa8\x80\r\n\r\n Windows \xe6\x9c\xaa\xe6\xbf\x80\xe6\xb4\xbb";
        let expected = "out -> 你好: 我不能认识语言\r\n\r\n Windows 未激活";
        assert_eq!(auto_decode(stdout).as_deref(), Some(expected));
    }

    /// Log excerpt containing a control byte (0x1E) and private-use code points.
    ///
    /// The bytes are well-formed UTF-8, so the first default encoding must accept them and
    /// the call must neither panic nor fall through to another encoding.
    #[test]
    fn test_auto_decode_err() {
        const HEX_DATA: &[u8] = &[
            0x20, 0x1E, 0x20, 0x22, 0x2D, 0x30, 0x38, 0x2D, 0x30, 0x32, 0x5F, 0x31, 0x36, 0x3A, 0x32,
            0x34, 0x3A, 0x33, 0x37, 0x20, 0x30, 0x3A, 0x30, 0x30, 0x3A, 0x30, 0x30, 0x2E, 0x36, 0x35,
            0x35, 0x37, 0x31, 0x31, 0x20, 0x5B, 0x43, 0x4C, 0x45, 0x41, 0x4E, 0x5D, 0x20, 0x49, 0x4E,
            0x46, 0x4F, 0x20, 0xE9, 0x90, 0x97, 0xE5, 0xA0, 0x9F, 0xE6, 0xB9, 0xB0, 0xE8, 0xA4, 0xB0,
            0xE6, 0x92, 0xB3, 0xE5, 0xA2, 0xA0, 0xE7, 0xBB, 0x8B, 0xE5, 0xAC, 0xAA, 0xE7, 0xB0, 0xAD,
            0xE9, 0x90, 0x97, 0xE5, 0xA0, 0x9F, 0xE6, 0xB9, 0xB0, 0xE9, 0x94, 0x9B, 0x3F, 0x68, 0x67,
            0x2D, 0x61, 0x75, 0x74, 0x6F, 0x74, 0x65, 0x74, 0x2D, 0x33, 0x2E, 0x30, 0x2E, 0x32, 0x0D,
            0x0A, 0x32, 0x30, 0x32, 0x34, 0x2D, 0x30, 0x38, 0x2D, 0x30, 0x32, 0x5F, 0x31, 0x36, 0x3A,
            0x32, 0x34, 0x3A, 0x33, 0x37, 0x20, 0x30, 0x3A, 0x30, 0x30, 0x3A, 0x30, 0x30, 0x2E, 0x36,
            0x35, 0x36, 0x32, 0x30, 0x37, 0x20, 0x5B, 0x43, 0x4C, 0x45, 0x41, 0x4E, 0x5D, 0x20, 0x49,
            0x4E, 0x46, 0x4F, 0x20, 0xE7, 0xBB, 0xAF, 0xE8, 0x8D, 0xA4, 0xE7, 0xB2, 0xBA, 0xE8, 0xA4,
            0xB0, 0xE6, 0x92, 0xB3, 0xE5, 0xA2, 0xA0, 0xE7, 0xBB, 0xAF, 0xE8, 0x8D, 0xA4, 0xE7, 0xB2,
            0xBA, 0xE9, 0x90, 0x9C, 0xEE, 0x88, 0x9A, 0xEE, 0x95, 0xA8, 0xE9, 0x94, 0x9B, 0x3F, 0x63,
            0x70, 0x39, 0x33, 0x36, 0x20, 0x54, 0x72, 0x75, 0x65, 0x0D, 0x0A, 0x32, 0x30, 0x32, 0x34,
            0x2D, 0x30, 0x38, 0x2D, 0x30, 0x32, 0x5F, 0x31, 0x36, 0x3A, 0x32, 0x34, 0x3A, 0x33, 0x38,
            0x20, 0x30, 0x3A, 0x30, 0x30, 0x3A, 0x30, 0x31, 0x2E, 0x36, 0x31, 0x31, 0x35, 0x30, 0x33,
            0x20, 0x5B, 0x43, 0x4C, 0x45, 0x41, 0x4E, 0x5D, 0x20, 0x49, 0x4E, 0x46, 0x4F, 0x20, 0xE9,
            0x8D, 0xA5, 0xE6, 0x83, 0xA7, 0xE8, 0x88, 0xB0, 0xE9, 0x90, 0xA3, 0xE5, 0xB2, 0x84, 0xE6,
            0xBD, 0xB0, 0x3D, 0xE5, 0xAF, 0xAE, 0xE2, 0x82, 0xAC, 0xE6, 0xBF, 0xAE, 0xE5, 0xAC, 0xAB,
            0xE5, 0xA2, 0xBD, 0xE7, 0x90, 0x9B, 0xE5, 0xB1, 0xBC, 0xE6, 0x8D, 0xA2, 0xE9, 0x8D, 0x94,
            0x3F, 0x0D, 0x0A, 0x32, 0x30, 0x32, 0x34, 0x2D, 0x30, 0x38, 0x2D, 0x30, 0x32, 0x5F, 0x31,
            0x36, 0x3A, 0x32, 0x34, 0x3A, 0x33, 0x38, 0x20, 0x30, 0x3A, 0x30, 0x30, 0x3A, 0x30, 0x31,
            0x2E, 0x36, 0x35, 0x31, 0x36, 0x37, 0x39, 0x20, 0x5B, 0x43, 0x4C, 0x45, 0x41, 0x4E, 0x5D,
            0x20, 0x57, 0x41, 0x52, 0x4E, 0x49, 0x4E, 0x47, 0x20, 0x5B, 0xE7, 0xBB, 0xAF, 0xE8, 0x8D,
            0xA4, 0xE7, 0xB2, 0xBA, 0xE9, 0x90, 0x97, 0xE5, 0xA0, 0x9F, 0xE6, 0xB9, 0xB0, 0xE5, 0xA6,
            0xAB, 0xE2, 0x82, 0xAC, 0xE9, 0x8F, 0x8C, 0xEE, 0x99, 0xA3, 0x43, 0x49, 0x53, 0x2D, 0x58,
            0x53, 0x4B, 0x4C, 0x2D, 0x4C, 0x57, 0x30, 0x31, 0x2D, 0x56, 0x31, 0x2E, 0x30, 0x2E, 0x30,
            0x2D, 0x32, 0x30, 0x32, 0x34, 0x30, 0x33, 0x31, 0x39,
        ];
        let expected = std::str::from_utf8(HEX_DATA).expect("fixture is well-formed UTF-8");
        // Previously this compared `decoded` with itself, which can never fail.
        assert_eq!(auto_decode(HEX_DATA).as_deref(), Some(expected));
    }
}