use encoding::all::{
GB18030, GBK, ISO_2022_JP, ISO_8859_1, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5,
ISO_8859_6, ISO_8859_7, ISO_8859_8, UTF_8, WINDOWS_31J, WINDOWS_874, WINDOWS_949,
};
use encoding::{DecoderTrap, EncoderTrap, Encoding, RawDecoder, StringWriter};
use snafu::{Backtrace, Snafu};
use std::borrow::Cow;
use std::fmt::Debug;
/// An error that may be raised while encoding a string into bytes.
///
/// The enum is `#[non_exhaustive]`, so more variants may be added later.
#[derive(Debug, Snafu)]
#[non_exhaustive]
pub enum EncodeTextError {
/// A custom error carrying a free-form message.
///
/// The `Display` output is the message itself, without decoration.
#[snafu(display("{}", message))]
EncodeCustom {
/// The error message, either a static string or an owned one.
message: Cow<'static, str>,
/// Backtrace field of type `snafu::Backtrace`.
backtrace: Backtrace,
},
}
/// An error that may be raised while decoding bytes into a string.
///
/// The enum is `#[non_exhaustive]`, so more variants may be added later.
#[derive(Debug, Snafu)]
#[non_exhaustive]
pub enum DecodeTextError {
/// A custom error carrying a free-form message.
///
/// The `Display` output is the message itself, without decoration.
#[snafu(display("{}", message))]
DecodeCustom {
/// The error message, either a static string or an owned one.
message: Cow<'static, str>,
/// Backtrace field of type `snafu::Backtrace`.
backtrace: Backtrace,
},
}
/// Result alias for encoding operations, which fail with [`EncodeTextError`].
type EncodeResult<T> = Result<T, EncodeTextError>;
/// Result alias for decoding operations, which fail with [`DecodeTextError`].
type DecodeResult<T> = Result<T, DecodeTextError>;
/// A holder of encoding and decoding mechanisms for text.
///
/// Implementers convert between raw bytes and Rust strings and report the
/// name of the character set they handle.
pub trait TextCodec {
/// Obtain the name of the character set handled by this codec.
///
/// NOTE(review): the implementations in this file return defined terms
/// such as `"ISO_IR 100"`; presumably other implementers are expected to
/// follow the same convention — confirm.
fn name(&self) -> Cow<'static, str>;
/// Decode the given byte buffer into a single string.
///
/// # Errors
///
/// Returns a [`DecodeTextError`] when the bytes cannot be decoded.
fn decode(&self, text: &[u8]) -> DecodeResult<String>;
/// Encode the given text into a byte vector.
///
/// # Errors
///
/// Returns an [`EncodeTextError`] when the text cannot be encoded.
fn encode(&self, text: &str) -> EncodeResult<Vec<u8>>;
}
/// Forwarding implementation: a boxed codec behaves exactly like the codec
/// it owns. `T` may be unsized, so `Box<dyn TextCodec>` is covered as well.
impl<T: ?Sized> TextCodec for Box<T>
where
    T: TextCodec,
{
    /// Returns the name reported by the boxed codec.
    fn name(&self) -> Cow<'static, str> {
        T::name(&**self)
    }

    /// Decodes `text` with the boxed codec.
    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
        T::decode(&**self, text)
    }

    /// Encodes `text` with the boxed codec.
    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
        T::encode(&**self, text)
    }
}
/// Forwarding implementation: a shared reference to a codec is itself a
/// codec. `T` may be unsized, so `&dyn TextCodec` is covered as well.
impl<T: ?Sized> TextCodec for &'_ T
where
    T: TextCodec,
{
    /// Returns the name reported by the referenced codec.
    fn name(&self) -> Cow<'static, str> {
        T::name(*self)
    }

    /// Decodes `text` with the referenced codec.
    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
        T::decode(*self, text)
    }

    /// Encodes `text` with the referenced codec.
    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
        T::encode(*self, text)
    }
}
/// A descriptor for a specific character set,
/// taking part in text encoding and decoding.
///
/// This is an opaque wrapper around the private `CharsetImpl` enum.
/// The derived `Default` yields the wrapper around `CharsetImpl::Default`,
/// whose reported name is `"ISO_IR 6"`.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct SpecificCharacterSet(CharsetImpl);
impl SpecificCharacterSet {
pub const ISO_IR_6: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::Default);
pub const ISO_IR_100: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::IsoIr100);
pub const ISO_IR_192: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::IsoIr192);
pub fn from_code(code: &str) -> Option<Self> {
CharsetImpl::from_code(code).map(SpecificCharacterSet)
}
}
/// Delegates every codec operation to the wrapped `CharsetImpl`.
impl TextCodec for SpecificCharacterSet {
    /// Returns the name of the wrapped character set.
    fn name(&self) -> Cow<'static, str> {
        let Self(charset) = self;
        charset.name()
    }

    /// Decodes `text` using the wrapped character set.
    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
        let Self(charset) = self;
        charset.decode(text)
    }

    /// Encodes `text` using the wrapped character set.
    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
        let Self(charset) = self;
        charset.encode(text)
    }
}
/// The character sets known to this module, one variant per set.
///
/// The derived ordering follows declaration order, and `Default` is the
/// default variant.
#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
#[non_exhaustive]
enum CharsetImpl {
    /// The default character set (`ISO_IR 6`).
    #[default]
    Default,
    /// `ISO_IR 13`
    IsoIr13,
    /// `ISO_IR 87`
    IsoIr87,
    /// `ISO_IR 100`
    IsoIr100,
    /// `ISO_IR 101`
    IsoIr101,
    /// `ISO_IR 109`
    IsoIr109,
    /// `ISO_IR 110`
    IsoIr110,
    /// `ISO_IR 126`
    IsoIr126,
    /// `ISO_IR 127`
    IsoIr127,
    /// `ISO_IR 138`
    IsoIr138,
    /// `ISO_IR 144`
    IsoIr144,
    /// `ISO_IR 149`
    IsoIr149,
    /// `ISO_IR 166`
    IsoIr166,
    /// `ISO_IR 192`
    IsoIr192,
    /// `GB18030`
    Gb18030,
    /// `GBK` (also selected by `GB2312` and `ISO 2022 IR 58`)
    Gbk,
}

impl CharsetImpl {
    /// Obtain the character set identified by the given code string.
    ///
    /// Trailing whitespace is ignored; leading whitespace and letter case
    /// are significant. Returns `None` for any unrecognized code.
    pub fn from_code(uid: &str) -> Option<Self> {
        /// Every accepted spelling, grouped by the character set it selects.
        const ALIASES: &[(CharsetImpl, &[&str])] = &[
            (
                CharsetImpl::Default,
                &["Default", "ISO_IR_6", "ISO_IR 6", "ISO 2022 IR 6"],
            ),
            (
                CharsetImpl::IsoIr13,
                &["ISO_IR_13", "ISO_IR 13", "ISO 2022 IR 13"],
            ),
            (
                CharsetImpl::IsoIr87,
                &["ISO_IR_87", "ISO_IR 87", "ISO 2022 IR 87"],
            ),
            (
                CharsetImpl::IsoIr100,
                &["ISO_IR_100", "ISO_IR 100", "ISO 2022 IR 100"],
            ),
            (
                CharsetImpl::IsoIr101,
                &["ISO_IR_101", "ISO_IR 101", "ISO 2022 IR 101"],
            ),
            (
                CharsetImpl::IsoIr109,
                &["ISO_IR_109", "ISO_IR 109", "ISO 2022 IR 109"],
            ),
            (
                CharsetImpl::IsoIr110,
                &["ISO_IR_110", "ISO_IR 110", "ISO 2022 IR 110"],
            ),
            (
                CharsetImpl::IsoIr126,
                &["ISO_IR_126", "ISO_IR 126", "ISO 2022 IR 126"],
            ),
            (
                CharsetImpl::IsoIr127,
                &["ISO_IR_127", "ISO_IR 127", "ISO 2022 IR 127"],
            ),
            (
                CharsetImpl::IsoIr138,
                &["ISO_IR_138", "ISO_IR 138", "ISO 2022 IR 138"],
            ),
            (
                CharsetImpl::IsoIr144,
                &["ISO_IR_144", "ISO_IR 144", "ISO 2022 IR 144"],
            ),
            (
                CharsetImpl::IsoIr149,
                &["ISO_IR_149", "ISO_IR 149", "ISO 2022 IR 149"],
            ),
            (
                CharsetImpl::IsoIr166,
                &["ISO_IR_166", "ISO_IR 166", "ISO 2022 IR 166"],
            ),
            (CharsetImpl::IsoIr192, &["ISO_IR_192", "ISO_IR 192"]),
            (CharsetImpl::Gb18030, &["GB18030"]),
            (CharsetImpl::Gbk, &["GBK", "GB2312", "ISO 2022 IR 58"]),
        ];

        let code = uid.trim_end();
        ALIASES
            .iter()
            .find(|(_, names)| names.iter().any(|name| *name == code))
            .map(|&(charset, _)| charset)
    }
}
impl TextCodec for CharsetImpl {
fn name(&self) -> Cow<'static, str> {
Cow::Borrowed(match self {
CharsetImpl::Default => "ISO_IR 6",
CharsetImpl::IsoIr13 => "ISO_IR 13",
CharsetImpl::IsoIr87 => "ISO_IR 87",
CharsetImpl::IsoIr100 => "ISO_IR 100",
CharsetImpl::IsoIr101 => "ISO_IR 101",
CharsetImpl::IsoIr109 => "ISO_IR 109",
CharsetImpl::IsoIr110 => "ISO_IR 110",
CharsetImpl::IsoIr126 => "ISO_IR 126",
CharsetImpl::IsoIr127 => "ISO_IR 127",
CharsetImpl::IsoIr138 => "ISO_IR 138",
CharsetImpl::IsoIr144 => "ISO_IR 144",
CharsetImpl::IsoIr149 => "ISO_IR 149",
CharsetImpl::IsoIr166 => "ISO_IR 166",
CharsetImpl::IsoIr192 => "ISO_IR 192",
CharsetImpl::Gb18030 => "GB18030",
CharsetImpl::Gbk => "GBK",
})
}
fn decode(&self, text: &[u8]) -> DecodeResult<String> {
match self {
CharsetImpl::Default => DefaultCharacterSetCodec.decode(text),
CharsetImpl::IsoIr13 => IsoIr13CharacterSetCodec.decode(text),
CharsetImpl::IsoIr87 => IsoIr87CharacterSetCodec.decode(text),
CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.decode(text),
CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.decode(text),
CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.decode(text),
CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.decode(text),
CharsetImpl::IsoIr126 => IsoIr126CharacterSetCodec.decode(text),
CharsetImpl::IsoIr127 => IsoIr127CharacterSetCodec.decode(text),
CharsetImpl::IsoIr138 => IsoIr138CharacterSetCodec.decode(text),
CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.decode(text),
CharsetImpl::IsoIr149 => IsoIr149CharacterSetCodec.decode(text),
CharsetImpl::IsoIr166 => IsoIr166CharacterSetCodec.decode(text),
CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.decode(text),
CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.decode(text),
CharsetImpl::Gbk => GBKCharacterSetCodec.decode(text),
}
}
fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
match self {
CharsetImpl::Default => DefaultCharacterSetCodec.encode(text),
CharsetImpl::IsoIr13 => IsoIr13CharacterSetCodec.encode(text),
CharsetImpl::IsoIr87 => IsoIr87CharacterSetCodec.encode(text),
CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.encode(text),
CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.encode(text),
CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.encode(text),
CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.encode(text),
CharsetImpl::IsoIr126 => IsoIr126CharacterSetCodec.encode(text),
CharsetImpl::IsoIr127 => IsoIr127CharacterSetCodec.encode(text),
CharsetImpl::IsoIr138 => IsoIr138CharacterSetCodec.encode(text),
CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.encode(text),
CharsetImpl::IsoIr149 => IsoIr149CharacterSetCodec.encode(text),
CharsetImpl::IsoIr166 => IsoIr166CharacterSetCodec.encode(text),
CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.encode(text),
CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.encode(text),
CharsetImpl::Gbk => GBKCharacterSetCodec.encode(text),
}
}
}
/// Decoder trap that replaces an undecodable byte with an octal escape.
///
/// The first byte of `input` is written to `output` as a backslash followed
/// by three octal digits (for example `0xFF` becomes `\377`). Any further
/// bytes in `input` are not written.
///
/// Returns `true` when an escape was written, so that decoding continues.
/// Returns `false` when `input` is empty: there is nothing to escape, and
/// refusing to handle the problem makes the caller report its decoding error
/// instead of panicking on an out-of-bounds index.
fn decode_text_trap(
    _decoder: &mut dyn RawDecoder,
    input: &[u8],
    output: &mut dyn StringWriter,
) -> bool {
    let byte = match input.first() {
        Some(&byte) => byte,
        None => return false,
    };

    output.write_char('\\');
    // Emit the three octal digits, most significant first.
    for shift in [6u32, 3, 0] {
        output.write_char(char::from(b'0' + ((byte >> shift) & 0o7)));
    }
    true
}
/// Declares a unit struct `$typ` that implements `TextCodec` on top of the
/// encoding value `$val`, reporting `$term` as its name.
///
/// Decoding goes through `decode_text_trap`, while encoding is strict.
/// Failures are wrapped in the custom error variants.
macro_rules! decl_character_set {
    ($typ: ident, $term: literal, $val: expr) => {
        #[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
        #[doc = "Data type for the "]
        #[doc = $term]
        #[doc = "character set encoding."]
        pub struct $typ;

        impl TextCodec for $typ {
            fn name(&self) -> Cow<'static, str> {
                Cow::Borrowed($term)
            }

            fn decode(&self, text: &[u8]) -> DecodeResult<String> {
                match $val.decode(text, DecoderTrap::Call(decode_text_trap)) {
                    Ok(decoded) => Ok(decoded),
                    Err(message) => Err(DecodeCustomSnafu { message }.build()),
                }
            }

            fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
                match $val.encode(text, EncoderTrap::Strict) {
                    Ok(encoded) => Ok(encoded),
                    Err(message) => Err(EncodeCustomSnafu { message }.build()),
                }
            }
        }
    };
}
/// Data type representing the default character set, reported as `ISO_IR 6`.
///
/// Text is converted with the `ISO_8859_1` encoding.
#[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
pub struct DefaultCharacterSetCodec;

impl TextCodec for DefaultCharacterSetCodec {
    /// Always `ISO_IR 6`.
    fn name(&self) -> Cow<'static, str> {
        Cow::Borrowed("ISO_IR 6")
    }

    /// Decodes `text`, handing undecodable input to `decode_text_trap`.
    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
        match ISO_8859_1.decode(text, DecoderTrap::Call(decode_text_trap)) {
            Ok(decoded) => Ok(decoded),
            Err(message) => Err(DecodeCustomSnafu { message }.build()),
        }
    }

    /// Encodes `text` strictly: unrepresentable characters are an error.
    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
        match ISO_8859_1.encode(text, EncoderTrap::Strict) {
            Ok(encoded) => Ok(encoded),
            Err(message) => Err(EncodeCustomSnafu { message }.build()),
        }
    }
}
// Unit codec types generated by `decl_character_set!`.
// Each invocation gives: the type name, the name it reports,
// and the value from the `encoding` crate that does the conversion.
decl_character_set!(IsoIr13CharacterSetCodec, "ISO_IR 13", WINDOWS_31J);
decl_character_set!(IsoIr87CharacterSetCodec, "ISO_IR 87", ISO_2022_JP);
decl_character_set!(IsoIr100CharacterSetCodec, "ISO_IR 100", ISO_8859_1);
decl_character_set!(IsoIr101CharacterSetCodec, "ISO_IR 101", ISO_8859_2);
decl_character_set!(IsoIr109CharacterSetCodec, "ISO_IR 109", ISO_8859_3);
decl_character_set!(IsoIr110CharacterSetCodec, "ISO_IR 110", ISO_8859_4);
// Note that the ISO_IR numbers do not follow the ISO 8859 part numbers.
decl_character_set!(IsoIr126CharacterSetCodec, "ISO_IR 126", ISO_8859_7);
decl_character_set!(IsoIr127CharacterSetCodec, "ISO_IR 127", ISO_8859_6);
decl_character_set!(IsoIr138CharacterSetCodec, "ISO_IR 138", ISO_8859_8);
decl_character_set!(IsoIr144CharacterSetCodec, "ISO_IR 144", ISO_8859_5);
decl_character_set!(IsoIr149CharacterSetCodec, "ISO_IR 149", WINDOWS_949);
decl_character_set!(IsoIr166CharacterSetCodec, "ISO_IR 166", WINDOWS_874);
decl_character_set!(Utf8CharacterSetCodec, "ISO_IR 192", UTF_8);
decl_character_set!(Gb18030CharacterSetCodec, "GB18030", GB18030);
decl_character_set!(GBKCharacterSetCodec, "GBK", GBK);
/// The result of a text validation procedure (see the `validate_*` functions).
///
/// The derived ordering follows declaration order:
/// `Ok < BadCharacters < NotOk`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum TextValidationOutcome {
/// The text passed validation.
Ok,
/// Returned by `validate_iso_8859` when strict decoding fails
/// but decoding with the escaping trap succeeds.
BadCharacters,
/// The text failed validation.
NotOk,
}
/// Check whether the given byte slice can be decoded as ISO-8859-1 text.
///
/// Returns `Ok` when strict decoding succeeds, `BadCharacters` when only
/// decoding with the escaping trap succeeds, and `NotOk` when both fail.
///
/// NOTE(review): ISO-8859-1 presumably assigns a character to every byte
/// value, in which case the fallback path is never taken — confirm.
pub fn validate_iso_8859(text: &[u8]) -> TextValidationOutcome {
    if ISO_8859_1.decode(text, DecoderTrap::Strict).is_ok() {
        return TextValidationOutcome::Ok;
    }

    // Strict decoding failed: see whether escaping the bad bytes is enough.
    let lenient = ISO_8859_1.decode(text, DecoderTrap::Call(decode_text_trap));
    if lenient.is_ok() {
        TextValidationOutcome::BadCharacters
    } else {
        TextValidationOutcome::NotOk
    }
}
/// Check whether the given byte slice contains only ASCII digits.
///
/// An empty slice is accepted. Any other byte, including separators and
/// padding, yields `NotOk`.
pub fn validate_da(text: &[u8]) -> TextValidationOutcome {
    for byte in text {
        if !byte.is_ascii_digit() {
            return TextValidationOutcome::NotOk;
        }
    }
    TextValidationOutcome::Ok
}
/// Check whether the given byte slice contains only the bytes accepted for
/// a time value: ASCII digits, `\`, `.`, `-` and the space character.
///
/// An empty slice is accepted.
pub fn validate_tm(text: &[u8]) -> TextValidationOutcome {
    let has_invalid_byte = text
        .iter()
        .copied()
        .any(|byte| !matches!(byte, b'0'..=b'9' | b'\\' | b'.' | b'-' | b' '));

    if has_invalid_byte {
        TextValidationOutcome::NotOk
    } else {
        TextValidationOutcome::Ok
    }
}
/// Check whether the given byte slice contains only the bytes accepted for
/// a date-time value: ASCII digits, `.`, `-`, `+`, `\` and the space
/// character.
///
/// An empty slice is accepted.
pub fn validate_dt(text: &[u8]) -> TextValidationOutcome {
    const EXTRA_BYTES: [u8; 5] = [b'.', b'-', b'+', b' ', b'\\'];

    let offending = text
        .iter()
        .find(|&&byte| !(byte.is_ascii_digit() || EXTRA_BYTES.contains(&byte)));

    match offending {
        None => TextValidationOutcome::Ok,
        Some(_) => TextValidationOutcome::NotOk,
    }
}
/// Check whether the given byte slice contains only the bytes accepted for
/// a code string: ASCII digits, ASCII uppercase letters, `_` and the space
/// character.
///
/// An empty slice is accepted.
pub fn validate_cs(text: &[u8]) -> TextValidationOutcome {
    fn is_code_string_byte(byte: u8) -> bool {
        matches!(byte, b'0'..=b'9' | b'A'..=b'Z' | b' ' | b'_')
    }

    if !text.iter().copied().all(is_code_string_byte) {
        return TextValidationOutcome::NotOk;
    }
    TextValidationOutcome::Ok
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `codec` encodes `string` into exactly `bytes`
    /// and decodes `bytes` back into exactly `string`.
    fn test_codec<T>(codec: T, string: &str, bytes: &[u8])
    where
        T: TextCodec,
    {
        assert_eq!(codec.encode(string).expect("encoding"), bytes);
        assert_eq!(codec.decode(bytes).expect("decoding"), string);
    }

    #[test]
    fn iso_ir_6_baseline() {
        let codec = SpecificCharacterSet::default();
        test_codec(codec, "Smith^John", b"Smith^John");
    }

    #[test]
    fn iso_ir_13_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr13);
        // The single-byte fixture decodes to half-width katakana
        // (U+FF61..U+FF9F), not to the full-width forms. The text is spelled
        // with escapes so that width normalization of this file cannot
        // silently change it.
        test_codec(
            codec,
            "\u{FF94}\u{FF8F}\u{FF80}\u{FF9E}^\u{FF80}\u{FF9B}\u{FF73}",
            b"\xd4\xcf\xc0\xde^\xc0\xdb\xb3",
        );
    }

    #[test]
    fn iso_ir_87_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr87);
        test_codec(&codec, "山田^太郎", b"\x1b$B;3ED\x1b(B^\x1b$BB@O:");
        test_codec(&codec, "やまだ^たろう", b"\x1b$B$d$^$@\x1b(B^\x1b$B$?$m$&");
    }

    #[test]
    fn iso_ir_192_baseline() {
        let codec = SpecificCharacterSet::ISO_IR_192;
        test_codec(&codec, "Simões^John", "Simões^John".as_bytes());
        test_codec(codec, "Иванков^Андрей", "Иванков^Андрей".as_bytes());
    }

    #[test]
    fn iso_ir_100_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr100);
        test_codec(&codec, "Simões^João", b"Sim\xF5es^Jo\xE3o");
        test_codec(codec, "Günther^Hans", b"G\xfcnther^Hans");
    }

    #[test]
    fn iso_ir_101_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr101);
        test_codec(codec, "Günther^Hans", b"G\xfcnther^Hans");
    }

    #[test]
    fn iso_ir_110_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr110);
        // One character per byte of the fixture. U+013B and U+00A7
        // (bytes 0xA6 0xA7) and U+00D7 and U+00D8 (bytes 0xD7 0xD8) are
        // written as escapes because these adjacent pairs are easily
        // mistaken for mis-encoded text and "repaired" by text clean-up tools.
        test_codec(
            codec,
            "ĄĸŖĨ\u{13B}\u{A7}ŠĒĢŦŽĀÁÂÃÄÅÆĮČÉ^ĘËĖÍÎĪĐŅŌĶÔÕÖ\u{D7}\u{D8}ŲÚÛÜŨŪß",
            b"\xA1\xA2\xA3\xA5\xA6\xA7\xA9\xAA\xAB\xAC\xAE\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9^\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF",
        );
    }

    #[test]
    fn iso_ir_126_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr126);
        test_codec(codec, "Διονυσιος", b"\xC4\xE9\xEF\xED\xF5\xF3\xE9\xEF\xF2");
    }

    #[test]
    fn iso_ir_127_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr127);
        test_codec(
            codec,
            "قباني^لنزار",
            b"\xE2\xC8\xC7\xE6\xEA^\xE4\xE6\xD2\xC7\xD1",
        );
    }

    #[test]
    fn iso_ir_138_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr138);
        test_codec(
            &codec,
            "מקור השם עברית",
            b"\xEE\xF7\xE5\xF8\x20\xE4\xF9\xED\x20\xF2\xE1\xF8\xE9\xFA",
        );
        test_codec(
            codec,
            "שרון^דבורה",
            b"\xF9\xF8\xE5\xEF^\xE3\xE1\xE5\xF8\xE4",
        );
    }

    #[test]
    fn iso_ir_144_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr144);
        test_codec(
            &codec,
            "Иванков^Андрей",
            b"\xb8\xd2\xd0\xdd\xda\xde\xd2^\xb0\xdd\xd4\xe0\xd5\xd9",
        );
        test_codec(
            &codec,
            "Гол. мозг стандарт",
            b"\xB3\xDE\xDB.\x20\xDC\xDE\xD7\xD3\x20\xE1\xE2\xD0\xDD\xD4\xD0\xE0\xE2",
        );
        test_codec(&codec, "мозг 2мм", b"\xDC\xDE\xD7\xD3\x202\xDC\xDC");
    }

    #[test]
    fn iso_ir_149_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr149);
        test_codec(&codec, "김희중", b"\xB1\xE8\xC8\xF1\xC1\xDF");
        test_codec(
            codec,
            "Hong^Gildong=洪^吉洞=홍^길동",
            b"Hong^Gildong=\xFB\xF3^\xD1\xCE\xD4\xD7=\xC8\xAB^\xB1\xE6\xB5\xBF",
        );
    }

    #[test]
    fn iso_ir_166_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr166);
        test_codec(&codec, "ประเทศไทย", b"\xBB\xC3\xD0\xE0\xB7\xC8\xE4\xB7\xC2");
        test_codec(
            codec,
            "รหัสสำหรับอักขระไทยที่ใช้กับคอมพิวเตอร์",
            b"\xC3\xCB\xD1\xCA\xCA\xD3\xCB\xC3\xD1\xBA\xCD\xD1\xA1\xA2\xC3\xD0\xE4\xB7\xC2\xB7\xD5\xE8\xE3\xAA\xE9\xA1\xD1\xBA\xA4\xCD\xC1\xBE\xD4\xC7\xE0\xB5\xCD\xC3\xEC",
        );
    }

    #[test]
    fn gb_18030_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::Gb18030);
        test_codec(
            &codec,
            "Wang^XiaoDong=王^小东",
            b"Wang^XiaoDong=\xCD\xF5^\xD0\xA1\xB6\xAB",
        );
    }

    /// Decoding only: both fixtures are read through the GBK codec.
    #[test]
    fn gb_gbk_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::Gbk);

        let iso2022_ir58_bytes = vec![
            0xB0, 0xB2, 0xBB, 0xD5, 0xD0, 0xC7, 0xC1, 0xE9, 0xD0, 0xC5, 0xCF, 0xA2, 0xBF, 0xC6,
            0xBC, 0xBC, 0xD3, 0xD0, 0xCF, 0xDE, 0xB9, 0xAB, 0xCB, 0xBE,
        ];
        let rw = codec.decode(&iso2022_ir58_bytes).expect("decoding");
        assert_eq!(rw, "安徽星灵信息科技有限公司");

        let gb2312_bytes = vec![
            0xCA, 0xB9, 0xC6, 0xE4, 0xD3, 0xEB, 0xD4, 0xAD, 0xCA, 0xBC, 0xB2, 0xD6, 0xBF, 0xE2,
            0xB1, 0xA3, 0xB3, 0xD6, 0xD2, 0xBB, 0xD6, 0xC2,
        ];
        let rw2 = codec.decode(&gb2312_bytes).expect("decoding");
        assert_eq!(rw2, "使其与原始仓库保持一致");
    }
}