1use encoding::all::{
30 GB18030, ISO_2022_JP, ISO_8859_1, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5, ISO_8859_6,
31 ISO_8859_7, ISO_8859_8, UTF_8, WINDOWS_31J, WINDOWS_874, WINDOWS_949,
32};
33use encoding::{DecoderTrap, EncoderTrap, Encoding, RawDecoder, StringWriter};
34use snafu::{Backtrace, Snafu};
35use std::borrow::Cow;
36use std::fmt::Debug;
37
/// An error that occurred while encoding text into bytes.
#[derive(Debug, Snafu)]
#[non_exhaustive]
pub enum EncodeTextError {
    /// The encoder reported a failure
    /// which is described only by a textual message.
    #[snafu(display("{}", message))]
    EncodeCustom {
        /// The message describing the failure,
        /// which is also what the error displays.
        message: Cow<'static, str>,
        /// The backtrace associated with this error.
        backtrace: Backtrace,
    },
}
53
/// An error that occurred while decoding bytes into text.
#[derive(Debug, Snafu)]
#[non_exhaustive]
pub enum DecodeTextError {
    /// The decoder reported a failure
    /// which is described only by a textual message.
    #[snafu(display("{}", message))]
    DecodeCustom {
        /// The message describing the failure,
        /// which is also what the error displays.
        message: Cow<'static, str>,
        /// The backtrace associated with this error.
        backtrace: Backtrace,
    },
}
69
/// Shorthand for the outcome of an encoding operation.
type EncodeResult<T> = Result<T, EncodeTextError>;
/// Shorthand for the outcome of a decoding operation.
type DecodeResult<T> = Result<T, DecodeTextError>;
72
/// A holder of encoding and decoding mechanisms for text.
///
/// All methods take `&self`,
/// and the trait is also implemented for `Box<T>` and `&T`
/// by forwarding to the inner codec.
pub trait TextCodec {
    /// Obtains the name identifying this codec.
    ///
    /// The implementations in this module return the defined term
    /// of the respective character set, such as `"ISO_IR 100"`.
    fn name(&self) -> Cow<'static, str>;

    /// Decodes the given byte buffer as a single string.
    ///
    /// Returns a [`DecodeTextError`] if the bytes could not be decoded.
    fn decode(&self, text: &[u8]) -> DecodeResult<String>;

    /// Encodes a text value into a byte vector.
    ///
    /// Returns an [`EncodeTextError`] if the text could not be encoded.
    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>>;
}
95
96impl<T: ?Sized> TextCodec for Box<T>
97where
98 T: TextCodec,
99{
100 fn name(&self) -> Cow<'static, str> {
101 self.as_ref().name()
102 }
103
104 fn decode(&self, text: &[u8]) -> DecodeResult<String> {
105 self.as_ref().decode(text)
106 }
107
108 fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
109 self.as_ref().encode(text)
110 }
111}
112
113impl<T: ?Sized> TextCodec for &'_ T
114where
115 T: TextCodec,
116{
117 fn name(&self) -> Cow<'static, str> {
118 (**self).name()
119 }
120
121 fn decode(&self, text: &[u8]) -> DecodeResult<String> {
122 (**self).decode(text)
123 }
124
125 fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
126 (**self).encode(text)
127 }
128}
129
/// A descriptor for a specific character set,
/// which doubles as a text codec for that character set.
///
/// This is an opaque wrapper around the private `CharsetImpl` enum.
/// Values are obtained from the associated constants,
/// from [`SpecificCharacterSet::from_code`],
/// or from `Default`, which yields the default (`ISO_IR 6`) character set.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct SpecificCharacterSet(CharsetImpl);
148
149impl SpecificCharacterSet {
150 pub const ISO_IR_6: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::Default);
152
153 pub const ISO_IR_100: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::IsoIr100);
155
156 pub const ISO_IR_192: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::IsoIr192);
158
159 pub fn from_code(code: &str) -> Option<Self> {
173 CharsetImpl::from_code(code).map(SpecificCharacterSet)
174 }
175}
176
177impl TextCodec for SpecificCharacterSet {
178 fn name(&self) -> Cow<'static, str> {
179 self.0.name()
180 }
181
182 fn decode(&self, text: &[u8]) -> DecodeResult<String> {
183 self.0.decode(text)
184 }
185
186 fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
187 self.0.encode(text)
188 }
189}
190
/// The character sets supported by this module.
///
/// Each variant dispatches to a dedicated codec type
/// through the `TextCodec` implementation of this enum.
#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
#[non_exhaustive]
enum CharsetImpl {
    /// **ISO_IR 6**: the default character set,
    /// handled by `DefaultCharacterSetCodec`.
    #[default]
    Default,
    /// **ISO_IR 13**, handled by `IsoIr13CharacterSetCodec`.
    IsoIr13,
    /// **ISO_IR 87**, handled by `IsoIr87CharacterSetCodec`.
    IsoIr87,
    /// **ISO_IR 100**, handled by `IsoIr100CharacterSetCodec`.
    IsoIr100,
    /// **ISO_IR 101**, handled by `IsoIr101CharacterSetCodec`.
    IsoIr101,
    /// **ISO_IR 109**, handled by `IsoIr109CharacterSetCodec`.
    IsoIr109,
    /// **ISO_IR 110**, handled by `IsoIr110CharacterSetCodec`.
    IsoIr110,
    /// **ISO_IR 126**, handled by `IsoIr126CharacterSetCodec`.
    IsoIr126,
    /// **ISO_IR 127**, handled by `IsoIr127CharacterSetCodec`.
    IsoIr127,
    /// **ISO_IR 138**, handled by `IsoIr138CharacterSetCodec`.
    IsoIr138,
    /// **ISO_IR 144**, handled by `IsoIr144CharacterSetCodec`.
    IsoIr144,
    /// **ISO_IR 149**, handled by `IsoIr149CharacterSetCodec`.
    IsoIr149,
    /// **ISO_IR 166**, handled by `IsoIr166CharacterSetCodec`.
    IsoIr166,
    /// **ISO_IR 192**, handled by `Utf8CharacterSetCodec`.
    IsoIr192,
    /// **GB18030**, handled by `Gb18030CharacterSetCodec`.
    Gb18030,
}
232
233impl CharsetImpl {
234 pub fn from_code(uid: &str) -> Option<Self> {
239 use self::CharsetImpl::*;
240 match uid.trim_end() {
241 "Default" | "ISO_IR_6" | "ISO_IR 6" | "ISO 2022 IR 6" => Some(Default),
242 "ISO_IR_13" | "ISO_IR 13" | "ISO 2022 IR 13" => Some(IsoIr13),
243 "ISO_IR_87" | "ISO_IR 87" | "ISO 2022 IR 87" => Some(IsoIr87),
244 "ISO_IR_100" | "ISO_IR 100" | "ISO 2022 IR 100" => Some(IsoIr100),
245 "ISO_IR_101" | "ISO_IR 101" | "ISO 2022 IR 101" => Some(IsoIr101),
246 "ISO_IR_109" | "ISO_IR 109" | "ISO 2022 IR 109" => Some(IsoIr109),
247 "ISO_IR_110" | "ISO_IR 110" | "ISO 2022 IR 110" => Some(IsoIr110),
248 "ISO_IR_126" | "ISO_IR 126" | "ISO 2022 IR 126" => Some(IsoIr126),
249 "ISO_IR_127" | "ISO_IR 127" | "ISO 2022 IR 127" => Some(IsoIr127),
250 "ISO_IR_138" | "ISO_IR 138" | "ISO 2022 IR 138" => Some(IsoIr138),
251 "ISO_IR_144" | "ISO_IR 144" | "ISO 2022 IR 144" => Some(IsoIr144),
252 "ISO_IR_149" | "ISO_IR 149" | "ISO 2022 IR 149" => Some(IsoIr149),
253 "ISO_IR_166" | "ISO_IR 166" | "ISO 2022 IR 166" => Some(IsoIr166),
254 "ISO_IR_192" | "ISO_IR 192" => Some(IsoIr192),
255 "GB18030" => Some(Gb18030),
256 _ => None,
257 }
258 }
259}
260
261impl TextCodec for CharsetImpl {
262 fn name(&self) -> Cow<'static, str> {
263 Cow::Borrowed(match self {
264 CharsetImpl::Default => "ISO_IR 6",
265 CharsetImpl::IsoIr13 => "ISO_IR 13",
266 CharsetImpl::IsoIr87 => "ISO_IR 87",
267 CharsetImpl::IsoIr100 => "ISO_IR 100",
268 CharsetImpl::IsoIr101 => "ISO_IR 101",
269 CharsetImpl::IsoIr109 => "ISO_IR 109",
270 CharsetImpl::IsoIr110 => "ISO_IR 110",
271 CharsetImpl::IsoIr126 => "ISO_IR 126",
272 CharsetImpl::IsoIr127 => "ISO_IR 127",
273 CharsetImpl::IsoIr138 => "ISO_IR 138",
274 CharsetImpl::IsoIr144 => "ISO_IR 144",
275 CharsetImpl::IsoIr149 => "ISO_IR 149",
276 CharsetImpl::IsoIr166 => "ISO_IR 166",
277 CharsetImpl::IsoIr192 => "ISO_IR 192",
278 CharsetImpl::Gb18030 => "GB18030",
279 })
280 }
281
282 fn decode(&self, text: &[u8]) -> DecodeResult<String> {
283 match self {
284 CharsetImpl::Default => DefaultCharacterSetCodec.decode(text),
285 CharsetImpl::IsoIr13 => IsoIr13CharacterSetCodec.decode(text),
286 CharsetImpl::IsoIr87 => IsoIr87CharacterSetCodec.decode(text),
287 CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.decode(text),
288 CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.decode(text),
289 CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.decode(text),
290 CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.decode(text),
291 CharsetImpl::IsoIr126 => IsoIr126CharacterSetCodec.decode(text),
292 CharsetImpl::IsoIr127 => IsoIr127CharacterSetCodec.decode(text),
293 CharsetImpl::IsoIr138 => IsoIr138CharacterSetCodec.decode(text),
294 CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.decode(text),
295 CharsetImpl::IsoIr149 => IsoIr149CharacterSetCodec.decode(text),
296 CharsetImpl::IsoIr166 => IsoIr166CharacterSetCodec.decode(text),
297 CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.decode(text),
298 CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.decode(text),
299 }
300 }
301
302 fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
303 match self {
304 CharsetImpl::Default => DefaultCharacterSetCodec.encode(text),
305 CharsetImpl::IsoIr13 => IsoIr13CharacterSetCodec.encode(text),
306 CharsetImpl::IsoIr87 => IsoIr87CharacterSetCodec.encode(text),
307 CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.encode(text),
308 CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.encode(text),
309 CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.encode(text),
310 CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.encode(text),
311 CharsetImpl::IsoIr126 => IsoIr126CharacterSetCodec.encode(text),
312 CharsetImpl::IsoIr127 => IsoIr127CharacterSetCodec.encode(text),
313 CharsetImpl::IsoIr138 => IsoIr138CharacterSetCodec.encode(text),
314 CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.encode(text),
315 CharsetImpl::IsoIr149 => IsoIr149CharacterSetCodec.encode(text),
316 CharsetImpl::IsoIr166 => IsoIr166CharacterSetCodec.encode(text),
317 CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.encode(text),
318 CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.encode(text),
319 }
320 }
321}
322
323fn decode_text_trap(
324 _decoder: &mut dyn RawDecoder,
325 input: &[u8],
326 output: &mut dyn StringWriter,
327) -> bool {
328 let c = input[0];
329 let o0 = c & 7;
330 let o1 = (c & 56) >> 3;
331 let o2 = (c & 192) >> 6;
332 output.write_char('\\');
333 output.write_char((o2 + b'0') as char);
334 output.write_char((o1 + b'0') as char);
335 output.write_char((o0 + b'0') as char);
336 true
337}
338
/// Declares a unit struct named `$typ`
/// and implements `TextCodec` for it.
///
/// - `$typ` is the name of the new codec type;
/// - `$term` is the string literal reported by `name()`,
///   which is also inserted in the type's documentation;
/// - `$val` is the expression evaluating to the underlying encoding,
///   on which `decode` and `encode` are called.
///
/// Decoding uses `decode_text_trap` to handle undecodable bytes,
/// whereas encoding is strict.
/// Failure messages from the underlying encoding
/// are wrapped in this module's error types.
macro_rules! decl_character_set {
    ($typ: ident, $term: literal, $val: expr) => {
        #[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
        #[doc = "Data type for the "]
        #[doc = $term]
        #[doc = "character set encoding."]
        pub struct $typ;

        impl TextCodec for $typ {
            fn name(&self) -> Cow<'static, str> {
                let term: &'static str = $term;
                Cow::Borrowed(term)
            }

            fn decode(&self, text: &[u8]) -> DecodeResult<String> {
                let outcome = $val.decode(text, DecoderTrap::Call(decode_text_trap));
                outcome.map_err(|message| DecodeCustomSnafu { message }.build())
            }

            fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
                let outcome = $val.encode(text, EncoderTrap::Strict);
                outcome.map_err(|message| EncodeCustomSnafu { message }.build())
            }
        }
    };
}
365
/// Data type representing the default character set,
/// which is named `ISO_IR 6`.
#[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
pub struct DefaultCharacterSetCodec;
369
370impl TextCodec for DefaultCharacterSetCodec {
371 fn name(&self) -> Cow<'static, str> {
372 Cow::Borrowed("ISO_IR 6")
373 }
374
375 fn decode(&self, text: &[u8]) -> DecodeResult<String> {
376 ISO_8859_1
379 .decode(text, DecoderTrap::Call(decode_text_trap))
380 .map_err(|message| DecodeCustomSnafu { message }.build())
381 }
382
383 fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
384 ISO_8859_1
385 .encode(text, EncoderTrap::Strict)
386 .map_err(|message| EncodeCustomSnafu { message }.build())
387 }
388}
389
// Codec types for the remaining character sets.
// Each line declares the codec type, the defined term that it reports
// as its name, and the `encoding` crate codec that does the work.
// Note that the ISO_IR numbers do not follow the order
// of the ISO 8859 parts (e.g. ISO_IR 126 uses part 7, ISO_IR 127 part 6).
decl_character_set!(IsoIr13CharacterSetCodec, "ISO_IR 13", WINDOWS_31J);
decl_character_set!(IsoIr87CharacterSetCodec, "ISO_IR 87", ISO_2022_JP);
decl_character_set!(IsoIr100CharacterSetCodec, "ISO_IR 100", ISO_8859_1);
decl_character_set!(IsoIr101CharacterSetCodec, "ISO_IR 101", ISO_8859_2);
decl_character_set!(IsoIr109CharacterSetCodec, "ISO_IR 109", ISO_8859_3);
decl_character_set!(IsoIr110CharacterSetCodec, "ISO_IR 110", ISO_8859_4);
decl_character_set!(IsoIr126CharacterSetCodec, "ISO_IR 126", ISO_8859_7);
decl_character_set!(IsoIr127CharacterSetCodec, "ISO_IR 127", ISO_8859_6);
decl_character_set!(IsoIr138CharacterSetCodec, "ISO_IR 138", ISO_8859_8);
decl_character_set!(IsoIr144CharacterSetCodec, "ISO_IR 144", ISO_8859_5);
decl_character_set!(IsoIr149CharacterSetCodec, "ISO_IR 149", WINDOWS_949);
decl_character_set!(IsoIr166CharacterSetCodec, "ISO_IR 166", WINDOWS_874);
decl_character_set!(Utf8CharacterSetCodec, "ISO_IR 192", UTF_8);
decl_character_set!(Gb18030CharacterSetCodec, "GB18030", GB18030);
404
/// The result of a text validation procedure (see the `validate_*` functions).
///
/// The variants are declared from best to worst,
/// so the derived ordering gives `Ok < BadCharacters < NotOk`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum TextValidationOutcome {
    /// The text is fully valid.
    Ok,
    /// The text could only be decoded leniently:
    /// strict decoding failed,
    /// but decoding with escaping of the offending bytes succeeded.
    BadCharacters,
    /// The text is invalid.
    NotOk,
}
415
416pub fn validate_iso_8859(text: &[u8]) -> TextValidationOutcome {
418 if ISO_8859_1.decode(text, DecoderTrap::Strict).is_err() {
419 match ISO_8859_1.decode(text, DecoderTrap::Call(decode_text_trap)) {
420 Ok(_) => TextValidationOutcome::BadCharacters,
421 Err(_) => TextValidationOutcome::NotOk,
422 }
423 } else {
424 TextValidationOutcome::Ok
425 }
426}
427
428pub fn validate_da(text: &[u8]) -> TextValidationOutcome {
431 if text.iter().cloned().all(|c| c.is_ascii_digit()) {
432 TextValidationOutcome::Ok
433 } else {
434 TextValidationOutcome::NotOk
435 }
436}
437
438pub fn validate_tm(text: &[u8]) -> TextValidationOutcome {
441 if text.iter().cloned().all(|c| match c {
442 b'\\' | b'.' | b'-' | b' ' => true,
443 c => c.is_ascii_digit(),
444 }) {
445 TextValidationOutcome::Ok
446 } else {
447 TextValidationOutcome::NotOk
448 }
449}
450
451pub fn validate_dt(text: &[u8]) -> TextValidationOutcome {
454 if text.iter().cloned().all(|c| match c {
455 b'.' | b'-' | b'+' | b' ' | b'\\' => true,
456 c => c.is_ascii_digit(),
457 }) {
458 TextValidationOutcome::Ok
459 } else {
460 TextValidationOutcome::NotOk
461 }
462}
463
464pub fn validate_cs(text: &[u8]) -> TextValidationOutcome {
467 if text.iter().cloned().all(|c| match c {
468 b' ' | b'_' => true,
469 c => c.is_ascii_digit() || c.is_ascii_uppercase(),
470 }) {
471 TextValidationOutcome::Ok
472 } else {
473 TextValidationOutcome::NotOk
474 }
475}
476
/// Round-trip tests for each supported character set:
/// every sample string must encode to the expected bytes,
/// and those bytes must decode back to the same string.
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `codec` encodes `string` into exactly `bytes`
    /// and decodes `bytes` back into exactly `string`.
    fn test_codec<T>(codec: T, string: &str, bytes: &[u8])
    where
        T: TextCodec,
    {
        assert_eq!(codec.encode(string).expect("encoding"), bytes);
        assert_eq!(codec.decode(bytes).expect("decoding"), string);
    }

    #[test]
    fn iso_ir_6_baseline() {
        let codec = SpecificCharacterSet::default();
        test_codec(codec, "Smith^John", b"Smith^John");
    }

    #[test]
    fn iso_ir_13_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr13);
        // The expected bytes are single-byte half-width katakana,
        // so the text must be half-width as well
        // (full-width katakana would encode to two bytes per character).
        // The characters are written as escapes so that width normalization
        // by editors or other tooling cannot silently alter them.
        test_codec(
            codec,
            "\u{ff94}\u{ff8f}\u{ff80}\u{ff9e}^\u{ff80}\u{ff9b}\u{ff73}",
            b"\xd4\xcf\xc0\xde^\xc0\xdb\xb3",
        );
    }

    #[test]
    fn iso_ir_87_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr87);
        test_codec(&codec, "山田^太郎", b"\x1b$B;3ED\x1b(B^\x1b$BB@O:");
        test_codec(&codec, "やまだ^たろう", b"\x1b$B$d$^$@\x1b(B^\x1b$B$?$m$&");
    }

    #[test]
    fn iso_ir_192_baseline() {
        let codec = SpecificCharacterSet::ISO_IR_192;
        test_codec(&codec, "Simões^John", "Simões^John".as_bytes());
        test_codec(codec, "Иванков^Андрей", "Иванков^Андрей".as_bytes());
    }

    #[test]
    fn iso_ir_100_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr100);
        test_codec(&codec, "Simões^João", b"Sim\xF5es^Jo\xE3o");
        test_codec(codec, "Günther^Hans", b"G\xfcnther^Hans");
    }

    #[test]
    fn iso_ir_101_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr101);
        test_codec(codec, "Günther^Hans", b"G\xfcnther^Hans");
    }

    #[test]
    fn iso_ir_110_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr110);
        // One character per expected byte.
        // The characters for bytes 0xA6 0xA7 (U+013B, U+00A7)
        // and 0xD7 0xD8 (U+00D7, U+00D8) are written as escapes,
        // because these pairs are prone to being merged into
        // a single unrelated character by text-repair tooling.
        test_codec(
            codec,
            "ĄĸŖĨ\u{13b}\u{a7}ŠĒĢŦŽĀÁÂÃÄÅÆĮČÉ^ĘËĖÍÎĪĐŅŌĶÔÕÖ\u{d7}\u{d8}ŲÚÛÜŨŪß",
            b"\xA1\xA2\xA3\xA5\xA6\xA7\xA9\xAA\xAB\xAC\xAE\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9^\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF",
        );
    }

    #[test]
    fn iso_ir_126_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr126);
        test_codec(codec, "Διονυσιος", b"\xC4\xE9\xEF\xED\xF5\xF3\xE9\xEF\xF2");
    }

    #[test]
    fn iso_ir_127_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr127);
        test_codec(
            codec,
            "قباني^لنزار",
            b"\xE2\xC8\xC7\xE6\xEA^\xE4\xE6\xD2\xC7\xD1",
        );
    }

    #[test]
    fn iso_ir_138_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr138);
        test_codec(
            &codec,
            "מקור השם עברית",
            b"\xEE\xF7\xE5\xF8\x20\xE4\xF9\xED\x20\xF2\xE1\xF8\xE9\xFA",
        );
        test_codec(
            codec,
            "שרון^דבורה",
            b"\xF9\xF8\xE5\xEF^\xE3\xE1\xE5\xF8\xE4",
        );
    }

    #[test]
    fn iso_ir_144_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr144);
        test_codec(
            &codec,
            "Иванков^Андрей",
            b"\xb8\xd2\xd0\xdd\xda\xde\xd2^\xb0\xdd\xd4\xe0\xd5\xd9",
        );
        test_codec(
            &codec,
            "Гол. мозг стандарт",
            b"\xB3\xDE\xDB.\x20\xDC\xDE\xD7\xD3\x20\xE1\xE2\xD0\xDD\xD4\xD0\xE0\xE2",
        );
        test_codec(&codec, "мозг 2мм", b"\xDC\xDE\xD7\xD3\x202\xDC\xDC");
    }

    #[test]
    fn iso_ir_149_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr149);
        test_codec(&codec, "김희중", b"\xB1\xE8\xC8\xF1\xC1\xDF");
        test_codec(
            codec,
            "Hong^Gildong=洪^吉洞=홍^길동",
            b"Hong^Gildong=\xFB\xF3^\xD1\xCE\xD4\xD7=\xC8\xAB^\xB1\xE6\xB5\xBF",
        );
    }

    #[test]
    fn iso_ir_166_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::IsoIr166);
        test_codec(&codec, "ประเทศไทย", b"\xBB\xC3\xD0\xE0\xB7\xC8\xE4\xB7\xC2");
        test_codec(codec, "รหัสสำหรับอักขระไทยที่ใช้กับคอมพิวเตอร์", b"\xC3\xCB\xD1\xCA\xCA\xD3\xCB\xC3\xD1\xBA\xCD\xD1\xA1\xA2\xC3\xD0\xE4\xB7\xC2\xB7\xD5\xE8\xE3\xAA\xE9\xA1\xD1\xBA\xA4\xCD\xC1\xBE\xD4\xC7\xE0\xB5\xCD\xC3\xEC");
    }

    #[test]
    fn gb_18030_baseline() {
        let codec = SpecificCharacterSet(CharsetImpl::Gb18030);
        test_codec(
            &codec,
            "Wang^XiaoDong=王^小东",
            b"Wang^XiaoDong=\xCD\xF5^\xD0\xA1\xB6\xAB",
        );
    }
}