1use encoding::all::{
30 GB18030, GBK, ISO_2022_JP, ISO_8859_1, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5,
31 ISO_8859_6, ISO_8859_7, ISO_8859_8, UTF_8, WINDOWS_31J, WINDOWS_874, WINDOWS_949,
32};
33use encoding::{DecoderTrap, EncoderTrap, Encoding, RawDecoder, StringWriter};
34use snafu::{Backtrace, Snafu};
35use std::borrow::Cow;
36use std::fmt::Debug;
37
/// An error raised when encoding text into bytes fails.
#[derive(Debug, Snafu)]
#[non_exhaustive]
pub enum EncodeTextError {
    /// A free-form encoding failure.
    /// The error's display output is exactly the contained message.
    #[snafu(display("{}", message))]
    EncodeCustom {
        /// The message describing what went wrong.
        message: Cow<'static, str>,
        /// The backtrace attached to this error.
        backtrace: Backtrace,
    },
}
53
/// An error raised when decoding bytes into text fails.
#[derive(Debug, Snafu)]
#[non_exhaustive]
pub enum DecodeTextError {
    /// A free-form decoding failure.
    /// The error's display output is exactly the contained message.
    #[snafu(display("{}", message))]
    DecodeCustom {
        /// The message describing what went wrong.
        message: Cow<'static, str>,
        /// The backtrace attached to this error.
        backtrace: Backtrace,
    },
}
69
/// Result alias for encoding operations, failing with [`EncodeTextError`].
type EncodeResult<T> = Result<T, EncodeTextError>;
/// Result alias for decoding operations, failing with [`DecodeTextError`].
type DecodeResult<T> = Result<T, DecodeTextError>;
72
/// A holder of encoding and decoding mechanisms for text.
///
/// All methods take `&self` and are free of generics,
/// so the trait can be used behind `Box<dyn TextCodec>` or `&dyn TextCodec`.
pub trait TextCodec {
    /// Obtain the name identifying this codec.
    ///
    /// The implementations in this file return the character set term,
    /// such as `"ISO_IR 100"`.
    fn name(&self) -> Cow<'static, str>;

    /// Decode the given byte buffer as a single string.
    ///
    /// # Errors
    ///
    /// Returns a [`DecodeTextError`] if the bytes could not be decoded.
    fn decode(&self, text: &[u8]) -> DecodeResult<String>;

    /// Encode a text value into a byte vector.
    ///
    /// # Errors
    ///
    /// Returns an [`EncodeTextError`] if the text could not be encoded.
    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>>;
}
95
96impl<T: ?Sized> TextCodec for Box<T>
97where
98 T: TextCodec,
99{
100 fn name(&self) -> Cow<'static, str> {
101 self.as_ref().name()
102 }
103
104 fn decode(&self, text: &[u8]) -> DecodeResult<String> {
105 self.as_ref().decode(text)
106 }
107
108 fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
109 self.as_ref().encode(text)
110 }
111}
112
113impl<T: ?Sized> TextCodec for &'_ T
114where
115 T: TextCodec,
116{
117 fn name(&self) -> Cow<'static, str> {
118 (**self).name()
119 }
120
121 fn decode(&self, text: &[u8]) -> DecodeResult<String> {
122 (**self).decode(text)
123 }
124
125 fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
126 (**self).encode(text)
127 }
128}
129
/// A descriptor for a specific character set,
/// usable as a [`TextCodec`] for that character set.
///
/// This is an opaque wrapper around the private `CharsetImpl` enum.
/// The default value wraps `CharsetImpl::Default`,
/// which reports the name `"ISO_IR 6"`.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct SpecificCharacterSet(CharsetImpl);
148
149impl SpecificCharacterSet {
150 pub const ISO_IR_6: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::Default);
152
153 pub const ISO_IR_100: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::IsoIr100);
155
156 pub const ISO_IR_192: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::IsoIr192);
158
159 pub fn from_code(code: &str) -> Option<Self> {
173 CharsetImpl::from_code(code).map(SpecificCharacterSet)
174 }
175}
176
177impl TextCodec for SpecificCharacterSet {
178 fn name(&self) -> Cow<'static, str> {
179 self.0.name()
180 }
181
182 fn decode(&self, text: &[u8]) -> DecodeResult<String> {
183 self.0.decode(text)
184 }
185
186 fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
187 self.0.encode(text)
188 }
189}
190
/// The private enumeration of supported character sets,
/// wrapped by [`SpecificCharacterSet`].
///
/// Each variant is documented with the name it reports through
/// [`TextCodec::name`].
#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
#[non_exhaustive]
enum CharsetImpl {
    /// `ISO_IR 6`, the default character set.
    #[default]
    Default,
    /// `ISO_IR 13`
    IsoIr13,
    /// `ISO_IR 87`
    IsoIr87,
    /// `ISO_IR 100`
    IsoIr100,
    /// `ISO_IR 101`
    IsoIr101,
    /// `ISO_IR 109`
    IsoIr109,
    /// `ISO_IR 110`
    IsoIr110,
    /// `ISO_IR 126`
    IsoIr126,
    /// `ISO_IR 127`
    IsoIr127,
    /// `ISO_IR 138`
    IsoIr138,
    /// `ISO_IR 144`
    IsoIr144,
    /// `ISO_IR 149`
    IsoIr149,
    /// `ISO_IR 166`
    IsoIr166,
    /// `ISO_IR 192`, handled by the UTF-8 codec.
    IsoIr192,
    /// `GB18030`
    Gb18030,
    /// `GBK`; also selected by the codes `GB2312` and `ISO 2022 IR 58`.
    Gbk,
}
234
235impl CharsetImpl {
236 pub fn from_code(uid: &str) -> Option<Self> {
241 use self::CharsetImpl::*;
242 match uid.trim_end() {
243 "Default" | "ISO_IR_6" | "ISO_IR 6" | "ISO 2022 IR 6" => Some(Default),
244 "ISO_IR_13" | "ISO_IR 13" | "ISO 2022 IR 13" => Some(IsoIr13),
245 "ISO_IR_87" | "ISO_IR 87" | "ISO 2022 IR 87" => Some(IsoIr87),
246 "ISO_IR_100" | "ISO_IR 100" | "ISO 2022 IR 100" => Some(IsoIr100),
247 "ISO_IR_101" | "ISO_IR 101" | "ISO 2022 IR 101" => Some(IsoIr101),
248 "ISO_IR_109" | "ISO_IR 109" | "ISO 2022 IR 109" => Some(IsoIr109),
249 "ISO_IR_110" | "ISO_IR 110" | "ISO 2022 IR 110" => Some(IsoIr110),
250 "ISO_IR_126" | "ISO_IR 126" | "ISO 2022 IR 126" => Some(IsoIr126),
251 "ISO_IR_127" | "ISO_IR 127" | "ISO 2022 IR 127" => Some(IsoIr127),
252 "ISO_IR_138" | "ISO_IR 138" | "ISO 2022 IR 138" => Some(IsoIr138),
253 "ISO_IR_144" | "ISO_IR 144" | "ISO 2022 IR 144" => Some(IsoIr144),
254 "ISO_IR_149" | "ISO_IR 149" | "ISO 2022 IR 149" => Some(IsoIr149),
255 "ISO_IR_166" | "ISO_IR 166" | "ISO 2022 IR 166" => Some(IsoIr166),
256 "ISO_IR_192" | "ISO_IR 192" => Some(IsoIr192),
257 "GB18030" => Some(Gb18030),
258 "GBK" | "GB2312" | "ISO 2022 IR 58" => Some(Gbk),
259 _ => None,
260 }
261 }
262}
263
264impl TextCodec for CharsetImpl {
265 fn name(&self) -> Cow<'static, str> {
266 Cow::Borrowed(match self {
267 CharsetImpl::Default => "ISO_IR 6",
268 CharsetImpl::IsoIr13 => "ISO_IR 13",
269 CharsetImpl::IsoIr87 => "ISO_IR 87",
270 CharsetImpl::IsoIr100 => "ISO_IR 100",
271 CharsetImpl::IsoIr101 => "ISO_IR 101",
272 CharsetImpl::IsoIr109 => "ISO_IR 109",
273 CharsetImpl::IsoIr110 => "ISO_IR 110",
274 CharsetImpl::IsoIr126 => "ISO_IR 126",
275 CharsetImpl::IsoIr127 => "ISO_IR 127",
276 CharsetImpl::IsoIr138 => "ISO_IR 138",
277 CharsetImpl::IsoIr144 => "ISO_IR 144",
278 CharsetImpl::IsoIr149 => "ISO_IR 149",
279 CharsetImpl::IsoIr166 => "ISO_IR 166",
280 CharsetImpl::IsoIr192 => "ISO_IR 192",
281 CharsetImpl::Gb18030 => "GB18030",
282 CharsetImpl::Gbk => "GBK",
283 })
284 }
285
286 fn decode(&self, text: &[u8]) -> DecodeResult<String> {
287 match self {
288 CharsetImpl::Default => DefaultCharacterSetCodec.decode(text),
289 CharsetImpl::IsoIr13 => IsoIr13CharacterSetCodec.decode(text),
290 CharsetImpl::IsoIr87 => IsoIr87CharacterSetCodec.decode(text),
291 CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.decode(text),
292 CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.decode(text),
293 CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.decode(text),
294 CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.decode(text),
295 CharsetImpl::IsoIr126 => IsoIr126CharacterSetCodec.decode(text),
296 CharsetImpl::IsoIr127 => IsoIr127CharacterSetCodec.decode(text),
297 CharsetImpl::IsoIr138 => IsoIr138CharacterSetCodec.decode(text),
298 CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.decode(text),
299 CharsetImpl::IsoIr149 => IsoIr149CharacterSetCodec.decode(text),
300 CharsetImpl::IsoIr166 => IsoIr166CharacterSetCodec.decode(text),
301 CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.decode(text),
302 CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.decode(text),
303 CharsetImpl::Gbk => GBKCharacterSetCodec.decode(text),
304 }
305 }
306
307 fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
308 match self {
309 CharsetImpl::Default => DefaultCharacterSetCodec.encode(text),
310 CharsetImpl::IsoIr13 => IsoIr13CharacterSetCodec.encode(text),
311 CharsetImpl::IsoIr87 => IsoIr87CharacterSetCodec.encode(text),
312 CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.encode(text),
313 CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.encode(text),
314 CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.encode(text),
315 CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.encode(text),
316 CharsetImpl::IsoIr126 => IsoIr126CharacterSetCodec.encode(text),
317 CharsetImpl::IsoIr127 => IsoIr127CharacterSetCodec.encode(text),
318 CharsetImpl::IsoIr138 => IsoIr138CharacterSetCodec.encode(text),
319 CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.encode(text),
320 CharsetImpl::IsoIr149 => IsoIr149CharacterSetCodec.encode(text),
321 CharsetImpl::IsoIr166 => IsoIr166CharacterSetCodec.encode(text),
322 CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.encode(text),
323 CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.encode(text),
324 CharsetImpl::Gbk => GBKCharacterSetCodec.encode(text),
325 }
326 }
327}
328
329fn decode_text_trap(
330 _decoder: &mut dyn RawDecoder,
331 input: &[u8],
332 output: &mut dyn StringWriter,
333) -> bool {
334 let c = input[0];
335 let o0 = c & 7;
336 let o1 = (c & 56) >> 3;
337 let o2 = (c & 192) >> 6;
338 output.write_char('\\');
339 output.write_char((o2 + b'0') as char);
340 output.write_char((o1 + b'0') as char);
341 output.write_char((o0 + b'0') as char);
342 true
343}
344
/// Declare a unit struct implementing [`TextCodec`] for one character set.
///
/// - `$typ`: name of the struct to declare;
/// - `$term`: string literal reported by `name()` and embedded in the docs;
/// - `$val`: the codec value (from the `encoding` crate) doing the actual work.
///
/// Decoding escapes undecodable bytes through `decode_text_trap`,
/// whereas encoding is strict and fails on unrepresentable characters.
macro_rules! decl_character_set {
    ($typ:ident, $term:literal, $val:expr) => {
        #[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
        #[doc = "Data type for the "]
        #[doc = $term]
        #[doc = "character set encoding."]
        pub struct $typ;

        impl TextCodec for $typ {
            fn name(&self) -> Cow<'static, str> {
                Cow::Borrowed($term)
            }

            fn decode(&self, text: &[u8]) -> DecodeResult<String> {
                match $val.decode(text, DecoderTrap::Call(decode_text_trap)) {
                    Ok(decoded) => Ok(decoded),
                    Err(message) => Err(DecodeCustomSnafu { message }.build()),
                }
            }

            fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
                match $val.encode(text, EncoderTrap::Strict) {
                    Ok(encoded) => Ok(encoded),
                    Err(message) => Err(EncodeCustomSnafu { message }.build()),
                }
            }
        }
    };
}
371
/// Data type for the default character set encoding,
/// which reports the name `"ISO_IR 6"`.
#[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
pub struct DefaultCharacterSetCodec;
375
376impl TextCodec for DefaultCharacterSetCodec {
377 fn name(&self) -> Cow<'static, str> {
378 Cow::Borrowed("ISO_IR 6")
379 }
380
381 fn decode(&self, text: &[u8]) -> DecodeResult<String> {
382 ISO_8859_1
385 .decode(text, DecoderTrap::Call(decode_text_trap))
386 .map_err(|message| DecodeCustomSnafu { message }.build())
387 }
388
389 fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
390 ISO_8859_1
391 .encode(text, EncoderTrap::Strict)
392 .map_err(|message| EncodeCustomSnafu { message }.build())
393 }
394}
395
// Codec types for the remaining character sets.
// Each invocation declares a unit struct implementing `TextCodec`:
// the first argument is the type name, the second is the term reported by
// `name()`, and the third is the `encoding` crate codec doing the conversion.
decl_character_set!(IsoIr13CharacterSetCodec, "ISO_IR 13", WINDOWS_31J);
decl_character_set!(IsoIr87CharacterSetCodec, "ISO_IR 87", ISO_2022_JP);
decl_character_set!(IsoIr100CharacterSetCodec, "ISO_IR 100", ISO_8859_1);
decl_character_set!(IsoIr101CharacterSetCodec, "ISO_IR 101", ISO_8859_2);
decl_character_set!(IsoIr109CharacterSetCodec, "ISO_IR 109", ISO_8859_3);
decl_character_set!(IsoIr110CharacterSetCodec, "ISO_IR 110", ISO_8859_4);
// Note that the ISO_IR numbers do not follow the ISO 8859 part numbers:
// 126, 127, 138 and 144 map to parts 7, 6, 8 and 5 respectively.
decl_character_set!(IsoIr126CharacterSetCodec, "ISO_IR 126", ISO_8859_7);
decl_character_set!(IsoIr127CharacterSetCodec, "ISO_IR 127", ISO_8859_6);
decl_character_set!(IsoIr138CharacterSetCodec, "ISO_IR 138", ISO_8859_8);
decl_character_set!(IsoIr144CharacterSetCodec, "ISO_IR 144", ISO_8859_5);
decl_character_set!(IsoIr149CharacterSetCodec, "ISO_IR 149", WINDOWS_949);
decl_character_set!(IsoIr166CharacterSetCodec, "ISO_IR 166", WINDOWS_874);
decl_character_set!(Utf8CharacterSetCodec, "ISO_IR 192", UTF_8);
decl_character_set!(Gb18030CharacterSetCodec, "GB18030", GB18030);
decl_character_set!(GBKCharacterSetCodec, "GBK", GBK);
411
/// The result of a text validation procedure (see the `validate_*` functions).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum TextValidationOutcome {
    /// The text passed the check.
    Ok,
    /// The text could only be decoded by escaping some of its bytes.
    /// In this file, only `validate_iso_8859` can return this outcome.
    BadCharacters,
    /// The text failed the check.
    NotOk,
}
422
423pub fn validate_iso_8859(text: &[u8]) -> TextValidationOutcome {
425 if ISO_8859_1.decode(text, DecoderTrap::Strict).is_err() {
426 match ISO_8859_1.decode(text, DecoderTrap::Call(decode_text_trap)) {
427 Ok(_) => TextValidationOutcome::BadCharacters,
428 Err(_) => TextValidationOutcome::NotOk,
429 }
430 } else {
431 TextValidationOutcome::Ok
432 }
433}
434
435pub fn validate_da(text: &[u8]) -> TextValidationOutcome {
438 if text.iter().cloned().all(|c| c.is_ascii_digit()) {
439 TextValidationOutcome::Ok
440 } else {
441 TextValidationOutcome::NotOk
442 }
443}
444
445pub fn validate_tm(text: &[u8]) -> TextValidationOutcome {
448 if text.iter().cloned().all(|c| match c {
449 b'\\' | b'.' | b'-' | b' ' => true,
450 c => c.is_ascii_digit(),
451 }) {
452 TextValidationOutcome::Ok
453 } else {
454 TextValidationOutcome::NotOk
455 }
456}
457
458pub fn validate_dt(text: &[u8]) -> TextValidationOutcome {
461 if text.iter().cloned().all(|c| match c {
462 b'.' | b'-' | b'+' | b' ' | b'\\' => true,
463 c => c.is_ascii_digit(),
464 }) {
465 TextValidationOutcome::Ok
466 } else {
467 TextValidationOutcome::NotOk
468 }
469}
470
471pub fn validate_cs(text: &[u8]) -> TextValidationOutcome {
474 if text.iter().cloned().all(|c| match c {
475 b' ' | b'_' => true,
476 c => c.is_ascii_digit() || c.is_ascii_uppercase(),
477 }) {
478 TextValidationOutcome::Ok
479 } else {
480 TextValidationOutcome::NotOk
481 }
482}
483
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `codec` round-trips: `string` must encode to exactly
    /// `bytes`, and `bytes` must decode back to exactly `string`.
    fn test_codec<T>(codec: T, string: &str, bytes: &[u8])
    where
        T: TextCodec,
    {
        assert_eq!(codec.encode(string).expect("encoding"), bytes);
        assert_eq!(codec.decode(bytes).expect("decoding"), string);
    }

    #[test]
    fn iso_ir_6_baseline() {
        let codec = SpecificCharacterSet::default();
        test_codec(codec, "Smith^John", b"Smith^John");
    }

    #[test]
    fn iso_ir_13_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr13);
        // The expected bytes are single-byte half-width katakana, one byte
        // per character, so the text must be half-width too
        // (full-width katakana would encode to two bytes each).
        // Escapes are used so that Unicode compatibility normalization of
        // this file cannot turn the characters into their full-width forms.
        test_codec(
            codec,
            "\u{FF94}\u{FF8F}\u{FF80}\u{FF9E}^\u{FF80}\u{FF9B}\u{FF73}",
            b"\xd4\xcf\xc0\xde^\xc0\xdb\xb3",
        );
    }

    #[test]
    fn iso_ir_87_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr87);
        test_codec(&codec, "山田^太郎", b"\x1b$B;3ED\x1b(B^\x1b$BB@O:");
        test_codec(&codec, "やまだ^たろう", b"\x1b$B$d$^$@\x1b(B^\x1b$B$?$m$&");
    }

    #[test]
    fn iso_ir_192_baseline() {
        let codec = SpecificCharacterSet::ISO_IR_192;
        test_codec(&codec, "Simões^John", "Simões^John".as_bytes());
        test_codec(codec, "Иванков^Андрей", "Иванков^Андрей".as_bytes());
    }

    #[test]
    fn iso_ir_100_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr100);
        test_codec(&codec, "Simões^João", b"Sim\xF5es^Jo\xE3o");
        test_codec(codec, "Günther^Hans", b"G\xfcnther^Hans");
    }

    #[test]
    fn iso_ir_101_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr101);
        test_codec(codec, "Günther^Hans", b"G\xfcnther^Hans");
    }

    #[test]
    fn iso_ir_110_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr110);
        // One character per expected byte (44 in total, `^` included).
        // Bytes A6 A7 are U+013B (L with cedilla) and U+00A7 (section sign);
        // bytes D7 D8 are U+00D7 (multiplication sign) and U+00D8 (O with
        // stroke). They are written as escapes to keep them from being
        // mangled into characters outside of this repertoire.
        test_codec(
            codec,
            "ĄĸŖĨ\u{13B}\u{A7}ŠĒĢŦŽĀÁÂÃÄÅÆĮČÉ^ĘËĖÍÎĪĐŅŌĶÔÕÖ\u{D7}\u{D8}ŲÚÛÜŨŪß",
            b"\xA1\xA2\xA3\xA5\xA6\xA7\xA9\xAA\xAB\xAC\xAE\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9^\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF",
        );
    }

    #[test]
    fn iso_ir_126_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr126);
        test_codec(codec, "Διονυσιος", b"\xC4\xE9\xEF\xED\xF5\xF3\xE9\xEF\xF2");
    }

    #[test]
    fn iso_ir_127_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr127);
        test_codec(
            codec,
            "قباني^لنزار",
            b"\xE2\xC8\xC7\xE6\xEA^\xE4\xE6\xD2\xC7\xD1",
        );
    }

    #[test]
    fn iso_ir_138_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr138);
        test_codec(
            &codec,
            "מקור השם עברית",
            b"\xEE\xF7\xE5\xF8\x20\xE4\xF9\xED\x20\xF2\xE1\xF8\xE9\xFA",
        );
        test_codec(
            codec,
            "שרון^דבורה",
            b"\xF9\xF8\xE5\xEF^\xE3\xE1\xE5\xF8\xE4",
        );
    }

    #[test]
    fn iso_ir_144_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr144);
        test_codec(
            &codec,
            "Иванков^Андрей",
            b"\xb8\xd2\xd0\xdd\xda\xde\xd2^\xb0\xdd\xd4\xe0\xd5\xd9",
        );
        test_codec(
            &codec,
            "Гол. мозг стандарт",
            b"\xB3\xDE\xDB.\x20\xDC\xDE\xD7\xD3\x20\xE1\xE2\xD0\xDD\xD4\xD0\xE0\xE2",
        );
        test_codec(&codec, "мозг 2мм", b"\xDC\xDE\xD7\xD3\x202\xDC\xDC");
    }

    #[test]
    fn iso_ir_149_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr149);
        test_codec(&codec, "김희중", b"\xB1\xE8\xC8\xF1\xC1\xDF");
        test_codec(
            codec,
            "Hong^Gildong=洪^吉洞=홍^길동",
            b"Hong^Gildong=\xFB\xF3^\xD1\xCE\xD4\xD7=\xC8\xAB^\xB1\xE6\xB5\xBF",
        );
    }

    #[test]
    fn iso_ir_166_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr166);
        test_codec(&codec, "ประเทศไทย", b"\xBB\xC3\xD0\xE0\xB7\xC8\xE4\xB7\xC2");
        // U+0E33 (SARA AM, byte D3) is written as an escape: compatibility
        // normalization would split it into two characters and the text
        // would no longer match the 39 expected bytes.
        test_codec(
            codec,
            "รหัสส\u{0E33}หรับอักขระไทยที่ใช้กับคอมพิวเตอร์",
            b"\xC3\xCB\xD1\xCA\xCA\xD3\xCB\xC3\xD1\xBA\xCD\xD1\xA1\xA2\xC3\xD0\xE4\xB7\xC2\xB7\xD5\xE8\xE3\xAA\xE9\xA1\xD1\xBA\xA4\xCD\xC1\xBE\xD4\xC7\xE0\xB5\xCD\xC3\xEC",
        );
    }

    #[test]
    fn gb_18030_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::Gb18030);
        test_codec(
            &codec,
            "Wang^XiaoDong=王^小东",
            b"Wang^XiaoDong=\xCD\xF5^\xD0\xA1\xB6\xAB",
        );
    }

    /// Decoding only: text samples which must decode with the GBK codec.
    #[test]
    fn gb_gbk_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::Gbk);

        let iso2022_ir58_bytes = vec![
            0xB0, 0xB2, 0xBB, 0xD5, 0xD0, 0xC7, 0xC1, 0xE9, 0xD0, 0xC5, 0xCF, 0xA2, 0xBF, 0xC6,
            0xBC, 0xBC, 0xD3, 0xD0, 0xCF, 0xDE, 0xB9, 0xAB, 0xCB, 0xBE,
        ];
        let rw = codec.decode(&iso2022_ir58_bytes).expect("decoding");

        assert_eq!(rw, "安徽星灵信息科技有限公司");

        let gb2312_bytes = vec![
            0xCA, 0xB9, 0xC6, 0xE4, 0xD3, 0xEB, 0xD4, 0xAD, 0xCA, 0xBC, 0xB2, 0xD6, 0xBF, 0xE2,
            0xB1, 0xA3, 0xB3, 0xD6, 0xD2, 0xBB, 0xD6, 0xC2,
        ];
        let rw2 = codec.decode(&gb2312_bytes).expect("decoding");

        assert_eq!(rw2, "使其与原始仓库保持一致");
    }
}