use encoding::all::{
    GB18030, GBK, ISO_2022_JP, ISO_8859_1, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5,
    ISO_8859_6, ISO_8859_7, ISO_8859_8, UTF_8, WINDOWS_31J, WINDOWS_874, WINDOWS_949,
};
use encoding::{DecoderTrap, EncoderTrap, Encoding, RawDecoder, StringWriter};
use snafu::{Backtrace, Snafu};
use std::borrow::Cow;
use std::fmt::Debug;
37
38#[derive(Debug, Snafu)]
40#[non_exhaustive]
41pub enum EncodeTextError {
42    #[snafu(display("{}", message))]
46    EncodeCustom {
47        message: Cow<'static, str>,
49        backtrace: Backtrace,
51    },
52}
53
54#[derive(Debug, Snafu)]
56#[non_exhaustive]
57pub enum DecodeTextError {
58    #[snafu(display("{}", message))]
62    DecodeCustom {
63        message: Cow<'static, str>,
65        backtrace: Backtrace,
67    },
68}
69
70type EncodeResult<T> = Result<T, EncodeTextError>;
71type DecodeResult<T> = Result<T, DecodeTextError>;
72
73pub trait TextCodec {
76    fn name(&self) -> Cow<'static, str>;
84
85    fn decode(&self, text: &[u8]) -> DecodeResult<String>;
89
90    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>>;
94}
95
96impl<T: ?Sized> TextCodec for Box<T>
97where
98    T: TextCodec,
99{
100    fn name(&self) -> Cow<'static, str> {
101        self.as_ref().name()
102    }
103
104    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
105        self.as_ref().decode(text)
106    }
107
108    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
109        self.as_ref().encode(text)
110    }
111}
112
113impl<T: ?Sized> TextCodec for &'_ T
114where
115    T: TextCodec,
116{
117    fn name(&self) -> Cow<'static, str> {
118        (**self).name()
119    }
120
121    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
122        (**self).decode(text)
123    }
124
125    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
126        (**self).encode(text)
127    }
128}
129
130#[derive(Debug, Default, Clone, PartialEq)]
147pub struct SpecificCharacterSet(CharsetImpl);
148
149impl SpecificCharacterSet {
150    pub const ISO_IR_6: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::Default);
152
153    pub const ISO_IR_100: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::IsoIr100);
155
156    pub const ISO_IR_192: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::IsoIr192);
158
159    pub fn from_code(code: &str) -> Option<Self> {
173        CharsetImpl::from_code(code).map(SpecificCharacterSet)
174    }
175}
176
177impl TextCodec for SpecificCharacterSet {
178    fn name(&self) -> Cow<'static, str> {
179        self.0.name()
180    }
181
182    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
183        self.0.decode(text)
184    }
185
186    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
187        self.0.encode(text)
188    }
189}
190
/// Private list of the character sets supported by this module.
///
/// Each variant is dispatched to a dedicated codec type
/// by the `TextCodec` implementation of this enum.
#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
#[non_exhaustive]
enum CharsetImpl {
    /// `ISO_IR 6`: the default character set.
    #[default]
    Default,
    /// `ISO_IR 13`.
    IsoIr13,
    /// `ISO_IR 87`.
    IsoIr87,
    /// `ISO_IR 100`.
    IsoIr100,
    /// `ISO_IR 101`.
    IsoIr101,
    /// `ISO_IR 109`.
    IsoIr109,
    /// `ISO_IR 110`.
    IsoIr110,
    /// `ISO_IR 126`.
    IsoIr126,
    /// `ISO_IR 127`.
    IsoIr127,
    /// `ISO_IR 138`.
    IsoIr138,
    /// `ISO_IR 144`.
    IsoIr144,
    /// `ISO_IR 149`.
    IsoIr149,
    /// `ISO_IR 166`.
    IsoIr166,
    /// `ISO_IR 192`: handled by the UTF-8 codec.
    IsoIr192,
    /// `GB18030`.
    Gb18030,
    /// `GBK`; also selected by the codes `GB2312` and `ISO 2022 IR 58`.
    Gbk,
}
234
235impl CharsetImpl {
236    pub fn from_code(uid: &str) -> Option<Self> {
241        use self::CharsetImpl::*;
242        match uid.trim_end() {
243            "Default" | "ISO_IR_6" | "ISO_IR 6" | "ISO 2022 IR 6" => Some(Default),
244            "ISO_IR_13" | "ISO_IR 13" | "ISO 2022 IR 13" => Some(IsoIr13),
245            "ISO_IR_87" | "ISO_IR 87" | "ISO 2022 IR 87" => Some(IsoIr87),
246            "ISO_IR_100" | "ISO_IR 100" | "ISO 2022 IR 100" => Some(IsoIr100),
247            "ISO_IR_101" | "ISO_IR 101" | "ISO 2022 IR 101" => Some(IsoIr101),
248            "ISO_IR_109" | "ISO_IR 109" | "ISO 2022 IR 109" => Some(IsoIr109),
249            "ISO_IR_110" | "ISO_IR 110" | "ISO 2022 IR 110" => Some(IsoIr110),
250            "ISO_IR_126" | "ISO_IR 126" | "ISO 2022 IR 126" => Some(IsoIr126),
251            "ISO_IR_127" | "ISO_IR 127" | "ISO 2022 IR 127" => Some(IsoIr127),
252            "ISO_IR_138" | "ISO_IR 138" | "ISO 2022 IR 138" => Some(IsoIr138),
253            "ISO_IR_144" | "ISO_IR 144" | "ISO 2022 IR 144" => Some(IsoIr144),
254            "ISO_IR_149" | "ISO_IR 149" | "ISO 2022 IR 149" => Some(IsoIr149),
255            "ISO_IR_166" | "ISO_IR 166" | "ISO 2022 IR 166" => Some(IsoIr166),
256            "ISO_IR_192" | "ISO_IR 192" => Some(IsoIr192),
257            "GB18030" => Some(Gb18030),
258            "GBK" | "GB2312" | "ISO 2022 IR 58" => Some(Gbk),
259            _ => None,
260        }
261    }
262}
263
264impl TextCodec for CharsetImpl {
265    fn name(&self) -> Cow<'static, str> {
266        Cow::Borrowed(match self {
267            CharsetImpl::Default => "ISO_IR 6",
268            CharsetImpl::IsoIr13 => "ISO_IR 13",
269            CharsetImpl::IsoIr87 => "ISO_IR 87",
270            CharsetImpl::IsoIr100 => "ISO_IR 100",
271            CharsetImpl::IsoIr101 => "ISO_IR 101",
272            CharsetImpl::IsoIr109 => "ISO_IR 109",
273            CharsetImpl::IsoIr110 => "ISO_IR 110",
274            CharsetImpl::IsoIr126 => "ISO_IR 126",
275            CharsetImpl::IsoIr127 => "ISO_IR 127",
276            CharsetImpl::IsoIr138 => "ISO_IR 138",
277            CharsetImpl::IsoIr144 => "ISO_IR 144",
278            CharsetImpl::IsoIr149 => "ISO_IR 149",
279            CharsetImpl::IsoIr166 => "ISO_IR 166",
280            CharsetImpl::IsoIr192 => "ISO_IR 192",
281            CharsetImpl::Gb18030 => "GB18030",
282            CharsetImpl::Gbk => "GBK",
283        })
284    }
285
286    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
287        match self {
288            CharsetImpl::Default => DefaultCharacterSetCodec.decode(text),
289            CharsetImpl::IsoIr13 => IsoIr13CharacterSetCodec.decode(text),
290            CharsetImpl::IsoIr87 => IsoIr87CharacterSetCodec.decode(text),
291            CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.decode(text),
292            CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.decode(text),
293            CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.decode(text),
294            CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.decode(text),
295            CharsetImpl::IsoIr126 => IsoIr126CharacterSetCodec.decode(text),
296            CharsetImpl::IsoIr127 => IsoIr127CharacterSetCodec.decode(text),
297            CharsetImpl::IsoIr138 => IsoIr138CharacterSetCodec.decode(text),
298            CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.decode(text),
299            CharsetImpl::IsoIr149 => IsoIr149CharacterSetCodec.decode(text),
300            CharsetImpl::IsoIr166 => IsoIr166CharacterSetCodec.decode(text),
301            CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.decode(text),
302            CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.decode(text),
303            CharsetImpl::Gbk => GBKCharacterSetCodec.decode(text),
304        }
305    }
306
307    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
308        match self {
309            CharsetImpl::Default => DefaultCharacterSetCodec.encode(text),
310            CharsetImpl::IsoIr13 => IsoIr13CharacterSetCodec.encode(text),
311            CharsetImpl::IsoIr87 => IsoIr87CharacterSetCodec.encode(text),
312            CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.encode(text),
313            CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.encode(text),
314            CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.encode(text),
315            CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.encode(text),
316            CharsetImpl::IsoIr126 => IsoIr126CharacterSetCodec.encode(text),
317            CharsetImpl::IsoIr127 => IsoIr127CharacterSetCodec.encode(text),
318            CharsetImpl::IsoIr138 => IsoIr138CharacterSetCodec.encode(text),
319            CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.encode(text),
320            CharsetImpl::IsoIr149 => IsoIr149CharacterSetCodec.encode(text),
321            CharsetImpl::IsoIr166 => IsoIr166CharacterSetCodec.encode(text),
322            CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.encode(text),
323            CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.encode(text),
324            CharsetImpl::Gbk => GBKCharacterSetCodec.encode(text),
325        }
326    }
327}
328
329fn decode_text_trap(
330    _decoder: &mut dyn RawDecoder,
331    input: &[u8],
332    output: &mut dyn StringWriter,
333) -> bool {
334    let c = input[0];
335    let o0 = c & 7;
336    let o1 = (c & 56) >> 3;
337    let o2 = (c & 192) >> 6;
338    output.write_char('\\');
339    output.write_char((o2 + b'0') as char);
340    output.write_char((o1 + b'0') as char);
341    output.write_char((o0 + b'0') as char);
342    true
343}
344
/// Declares a unit struct `$typ` and implements [`TextCodec`] for it
/// on top of the `encoding` crate codec `$val`.
///
/// `$term` is the character set name returned by `name()`
/// and spliced into the generated type's documentation.
///
/// Decoding escapes undecodable bytes through `decode_text_trap`;
/// encoding is strict and fails on characters the codec cannot represent.
macro_rules! decl_character_set {
    ($typ: ident, $term: literal, $val: expr) => {
        #[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
        #[doc = "Data type for the "]
        #[doc = $term]
        #[doc = "character set encoding."]
        pub struct $typ;

        impl TextCodec for $typ {
            fn name(&self) -> Cow<'static, str> {
                Cow::Borrowed($term)
            }

            fn decode(&self, text: &[u8]) -> DecodeResult<String> {
                $val.decode(text, DecoderTrap::Call(decode_text_trap))
                    .map_err(|message| DecodeCustomSnafu { message }.build())
            }

            fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
                $val.encode(text, EncoderTrap::Strict)
                    .map_err(|message| EncodeCustomSnafu { message }.build())
            }
        }
    };
}
371
/// Codec for the default character set, named `ISO_IR 6`.
#[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
pub struct DefaultCharacterSetCodec;
375
376impl TextCodec for DefaultCharacterSetCodec {
377    fn name(&self) -> Cow<'static, str> {
378        Cow::Borrowed("ISO_IR 6")
379    }
380
381    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
382        ISO_8859_1
385            .decode(text, DecoderTrap::Call(decode_text_trap))
386            .map_err(|message| DecodeCustomSnafu { message }.build())
387    }
388
389    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
390        ISO_8859_1
391            .encode(text, EncoderTrap::Strict)
392            .map_err(|message| EncodeCustomSnafu { message }.build())
393    }
394}
395
396decl_character_set!(IsoIr13CharacterSetCodec, "ISO_IR 13", WINDOWS_31J);
397decl_character_set!(IsoIr87CharacterSetCodec, "ISO_IR 87", ISO_2022_JP);
398decl_character_set!(IsoIr100CharacterSetCodec, "ISO_IR 100", ISO_8859_1);
399decl_character_set!(IsoIr101CharacterSetCodec, "ISO_IR 101", ISO_8859_2);
400decl_character_set!(IsoIr109CharacterSetCodec, "ISO_IR 109", ISO_8859_3);
401decl_character_set!(IsoIr110CharacterSetCodec, "ISO_IR 110", ISO_8859_4);
402decl_character_set!(IsoIr126CharacterSetCodec, "ISO_IR 126", ISO_8859_7);
403decl_character_set!(IsoIr127CharacterSetCodec, "ISO_IR 127", ISO_8859_6);
404decl_character_set!(IsoIr138CharacterSetCodec, "ISO_IR 138", ISO_8859_8);
405decl_character_set!(IsoIr144CharacterSetCodec, "ISO_IR 144", ISO_8859_5);
406decl_character_set!(IsoIr149CharacterSetCodec, "ISO_IR 149", WINDOWS_949);
407decl_character_set!(IsoIr166CharacterSetCodec, "ISO_IR 166", WINDOWS_874);
408decl_character_set!(Utf8CharacterSetCodec, "ISO_IR 192", UTF_8);
409decl_character_set!(Gb18030CharacterSetCodec, "GB18030", GB18030);
410decl_character_set!(GBKCharacterSetCodec, "GBK", GBK);
411
/// Result of validating a piece of text.
///
/// The derived ordering follows declaration order:
/// `Ok < BadCharacters < NotOk`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum TextValidationOutcome {
    /// The text passed validation.
    Ok,
    /// The text failed strict decoding but could still be decoded
    /// once the offending bytes were escaped.
    BadCharacters,
    /// The text failed validation.
    NotOk,
}
422
423pub fn validate_iso_8859(text: &[u8]) -> TextValidationOutcome {
425    if ISO_8859_1.decode(text, DecoderTrap::Strict).is_err() {
426        match ISO_8859_1.decode(text, DecoderTrap::Call(decode_text_trap)) {
427            Ok(_) => TextValidationOutcome::BadCharacters,
428            Err(_) => TextValidationOutcome::NotOk,
429        }
430    } else {
431        TextValidationOutcome::Ok
432    }
433}
434
435pub fn validate_da(text: &[u8]) -> TextValidationOutcome {
438    if text.iter().cloned().all(|c| c.is_ascii_digit()) {
439        TextValidationOutcome::Ok
440    } else {
441        TextValidationOutcome::NotOk
442    }
443}
444
445pub fn validate_tm(text: &[u8]) -> TextValidationOutcome {
448    if text.iter().cloned().all(|c| match c {
449        b'\\' | b'.' | b'-' | b' ' => true,
450        c => c.is_ascii_digit(),
451    }) {
452        TextValidationOutcome::Ok
453    } else {
454        TextValidationOutcome::NotOk
455    }
456}
457
458pub fn validate_dt(text: &[u8]) -> TextValidationOutcome {
461    if text.iter().cloned().all(|c| match c {
462        b'.' | b'-' | b'+' | b' ' | b'\\' => true,
463        c => c.is_ascii_digit(),
464    }) {
465        TextValidationOutcome::Ok
466    } else {
467        TextValidationOutcome::NotOk
468    }
469}
470
471pub fn validate_cs(text: &[u8]) -> TextValidationOutcome {
474    if text.iter().cloned().all(|c| match c {
475        b' ' | b'_' => true,
476        c => c.is_ascii_digit() || c.is_ascii_uppercase(),
477    }) {
478        TextValidationOutcome::Ok
479    } else {
480        TextValidationOutcome::NotOk
481    }
482}
483
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `codec` encodes `string` into `bytes`
    /// and decodes `bytes` back into `string`.
    fn test_codec<T>(codec: T, string: &str, bytes: &[u8])
    where
        T: TextCodec,
    {
        assert_eq!(codec.encode(string).expect("encoding"), bytes);
        assert_eq!(codec.decode(bytes).expect("decoding"), string);
    }

    #[test]
    fn iso_ir_6_baseline() {
        let codec = SpecificCharacterSet::default();
        test_codec(codec, "Smith^John", b"Smith^John");
    }

    #[test]
    fn iso_ir_13_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr13);
        // The expected bytes are single-byte half-width katakana codes,
        // so the text must be the half-width forms (U+FF61..=U+FF9F).
        // Escapes are used so that Unicode normalization of this file
        // cannot silently turn them into full-width katakana.
        test_codec(
            codec,
            "\u{FF94}\u{FF8F}\u{FF80}\u{FF9E}^\u{FF80}\u{FF9B}\u{FF73}",
            b"\xd4\xcf\xc0\xde^\xc0\xdb\xb3",
        );
    }

    #[test]
    fn iso_ir_87_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr87);
        test_codec(&codec, "山田^太郎", b"\x1b$B;3ED\x1b(B^\x1b$BB@O:");
        test_codec(&codec, "やまだ^たろう", b"\x1b$B$d$^$@\x1b(B^\x1b$B$?$m$&");
    }

    #[test]
    fn iso_ir_192_baseline() {
        let codec = SpecificCharacterSet::ISO_IR_192;
        test_codec(&codec, "Simões^John", "Simões^John".as_bytes());
        test_codec(codec, "Иванков^Андрей", "Иванков^Андрей".as_bytes());
    }

    #[test]
    fn iso_ir_100_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr100);
        test_codec(&codec, "Simões^João", b"Sim\xF5es^Jo\xE3o");
        test_codec(codec, "Günther^Hans", b"G\xfcnther^Hans");
    }

    #[test]
    fn iso_ir_101_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr101);
        test_codec(codec, "Günther^Hans", b"G\xfcnther^Hans");
    }

    #[test]
    fn iso_ir_110_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr110);
        // One character per expected byte: 0xA6 0xA7 are "Ļ§" and 0xD7 0xD8 are "×Ø".
        test_codec(codec, "ĄĸŖĨĻ§ŠĒĢŦŽĀÁÂÃÄÅÆĮČÉ^ĘËĖÍÎĪĐŅŌĶÔÕÖ×ØŲÚÛÜŨŪß", b"\xA1\xA2\xA3\xA5\xA6\xA7\xA9\xAA\xAB\xAC\xAE\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9^\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF");
    }

    #[test]
    fn iso_ir_126_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr126);
        test_codec(codec, "Διονυσιος", b"\xC4\xE9\xEF\xED\xF5\xF3\xE9\xEF\xF2");
    }

    #[test]
    fn iso_ir_127_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr127);
        test_codec(
            codec,
            "قباني^لنزار",
            b"\xE2\xC8\xC7\xE6\xEA^\xE4\xE6\xD2\xC7\xD1",
        );
    }

    #[test]
    fn iso_ir_138_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr138);
        test_codec(
            &codec,
            "מקור השם עברית",
            b"\xEE\xF7\xE5\xF8\x20\xE4\xF9\xED\x20\xF2\xE1\xF8\xE9\xFA",
        );
        test_codec(
            codec,
            "שרון^דבורה",
            b"\xF9\xF8\xE5\xEF^\xE3\xE1\xE5\xF8\xE4",
        );
    }

    #[test]
    fn iso_ir_144_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr144);
        test_codec(
            &codec,
            "Иванков^Андрей",
            b"\xb8\xd2\xd0\xdd\xda\xde\xd2^\xb0\xdd\xd4\xe0\xd5\xd9",
        );
        test_codec(
            &codec,
            "Гол. мозг стандарт",
            b"\xB3\xDE\xDB.\x20\xDC\xDE\xD7\xD3\x20\xE1\xE2\xD0\xDD\xD4\xD0\xE0\xE2",
        );
        test_codec(&codec, "мозг 2мм", b"\xDC\xDE\xD7\xD3\x202\xDC\xDC");
    }

    #[test]
    fn iso_ir_149_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr149);
        test_codec(&codec, "김희중", b"\xB1\xE8\xC8\xF1\xC1\xDF");
        test_codec(
            codec,
            "Hong^Gildong=洪^吉洞=홍^길동",
            b"Hong^Gildong=\xFB\xF3^\xD1\xCE\xD4\xD7=\xC8\xAB^\xB1\xE6\xB5\xBF",
        );
    }

    #[test]
    fn iso_ir_166_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr166);
        test_codec(&codec, "ประเทศไทย", b"\xBB\xC3\xD0\xE0\xB7\xC8\xE4\xB7\xC2");
        test_codec(codec, "รหัสสำหรับอักขระไทยที่ใช้กับคอมพิวเตอร์", b"\xC3\xCB\xD1\xCA\xCA\xD3\xCB\xC3\xD1\xBA\xCD\xD1\xA1\xA2\xC3\xD0\xE4\xB7\xC2\xB7\xD5\xE8\xE3\xAA\xE9\xA1\xD1\xBA\xA4\xCD\xC1\xBE\xD4\xC7\xE0\xB5\xCD\xC3\xEC");
    }

    #[test]
    fn gb_18030_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::Gb18030);
        test_codec(
            &codec,
            "Wang^XiaoDong=王^小东",
            b"Wang^XiaoDong=\xCD\xF5^\xD0\xA1\xB6\xAB",
        );
    }

    #[test]
    fn gb_gbk_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::Gbk);

        // Decoding only: both byte strings must decode through the GBK codec.
        let iso2022_ir58_bytes = vec![
            0xB0, 0xB2, 0xBB, 0xD5, 0xD0, 0xC7, 0xC1, 0xE9, 0xD0, 0xC5, 0xCF, 0xA2, 0xBF, 0xC6,
            0xBC, 0xBC, 0xD3, 0xD0, 0xCF, 0xDE, 0xB9, 0xAB, 0xCB, 0xBE,
        ];
        let rw = codec.decode(&iso2022_ir58_bytes).expect("decoding");

        assert_eq!(rw, "安徽星灵信息科技有限公司");

        let gb2312_bytes = vec![
            0xCA, 0xB9, 0xC6, 0xE4, 0xD3, 0xEB, 0xD4, 0xAD, 0xCA, 0xBC, 0xB2, 0xD6, 0xBF, 0xE2,
            0xB1, 0xA3, 0xB3, 0xD6, 0xD2, 0xBB, 0xD6, 0xC2,
        ];
        let rw2 = codec.decode(&gb2312_bytes).expect("decoding");

        assert_eq!(rw2, "使其与原始仓库保持一致");
    }
}