use encoding::all::{GB18030, ISO_8859_1, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5, UTF_8};
use encoding::{DecoderTrap, EncoderTrap, Encoding, RawDecoder, StringWriter};
use snafu::{Backtrace, Snafu};
use std::borrow::Cow;
use std::fmt::Debug;
/// An error raised while encoding text into bytes.
///
/// Its `Display` output is the `message` carried by the variant.
#[derive(Debug, Snafu)]
#[non_exhaustive]
pub enum EncodeTextError {
/// A free-form encoding failure.
///
/// In this module it is built from the message reported by the
/// underlying encoder.
#[snafu(display("{}", message))]
EncodeCustom {
/// The message describing what went wrong.
message: Cow<'static, str>,
/// The backtrace associated with this error.
backtrace: Backtrace,
},
}
/// An error raised while decoding bytes into text.
///
/// Its `Display` output is the `message` carried by the variant.
#[derive(Debug, Snafu)]
#[non_exhaustive]
pub enum DecodeTextError {
/// A free-form decoding failure.
///
/// In this module it is built from the message reported by the
/// underlying decoder.
#[snafu(display("{}", message))]
DecodeCustom {
/// The message describing what went wrong.
message: Cow<'static, str>,
/// The backtrace associated with this error.
backtrace: Backtrace,
},
}
/// Shorthand for a result whose error type is [`EncodeTextError`].
type EncodeResult<T> = Result<T, EncodeTextError>;
/// Shorthand for a result whose error type is [`DecodeTextError`].
type DecodeResult<T> = Result<T, DecodeTextError>;
/// A holder of the encoding and decoding mechanisms for text
/// in a particular character set.
///
/// The trait is object safe: see `DynamicTextCodec` for the boxed form.
pub trait TextCodec {
/// Obtain the name of the character set handled by this codec,
/// as a static string.
fn name(&self) -> &'static str;
/// Decode the given byte buffer as a single string.
///
/// Returns a [`DecodeTextError`] if the bytes cannot be decoded.
fn decode(&self, text: &[u8]) -> DecodeResult<String>;
/// Encode a text value into a byte vector.
///
/// Returns an [`EncodeTextError`] if the text cannot be encoded.
fn encode(&self, text: &str) -> EncodeResult<Vec<u8>>;
}
/// A boxed codec is itself a codec: every call is forwarded to the boxed
/// value. `T` may be unsized, so this also covers `Box<dyn TextCodec>`.
impl<T> TextCodec for Box<T>
where
    T: TextCodec + ?Sized,
{
    fn name(&self) -> &'static str {
        // `**self` peels off the reference and then the box.
        (**self).name()
    }

    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
        (**self).decode(text)
    }

    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
        (**self).encode(text)
    }
}
/// A shared reference to a codec is itself a codec: every call is forwarded
/// to the referenced value. `T` may be unsized, so this also covers
/// `&dyn TextCodec`.
impl<'a, T> TextCodec for &'a T
where
    T: TextCodec + ?Sized,
{
    fn name(&self) -> &'static str {
        // `*self` is the inner `&T`; call the implementation of `T` directly.
        T::name(*self)
    }

    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
        T::decode(*self, text)
    }

    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
        T::encode(*self, text)
    }
}
/// Type alias for a boxed, dynamically dispatched text codec.
///
/// This is the type returned by `SpecificCharacterSet::codec`.
pub type DynamicTextCodec = Box<dyn TextCodec>;
/// An enum type for the character sets known to this module.
///
/// Use `from_code` to obtain a variant from its textual code
/// and `codec` to obtain the matching text codec.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
#[non_exhaustive]
pub enum SpecificCharacterSet {
/// The default character set, identified by the code `ISO_IR 6`.
/// This is also the value produced by the `Default` implementation.
Default,
/// `ISO_IR 100`: handled by the ISO 8859-1 codec.
IsoIr100,
/// `ISO_IR 101`: handled by the ISO 8859-2 codec.
IsoIr101,
/// `ISO_IR 109`: handled by the ISO 8859-3 codec.
IsoIr109,
/// `ISO_IR 110`: handled by the ISO 8859-4 codec.
IsoIr110,
/// `ISO_IR 144`: handled by the ISO 8859-5 codec.
IsoIr144,
/// `ISO_IR 192`: handled by the UTF-8 codec.
IsoIr192,
/// `GB18030`: handled by the GB18030 codec.
GB18030,
}
impl Default for SpecificCharacterSet {
fn default() -> Self {
SpecificCharacterSet::Default
}
}
impl SpecificCharacterSet {
pub fn from_code(uid: &str) -> Option<Self> {
use self::SpecificCharacterSet::*;
match uid.trim_end() {
"Default" | "ISO_IR_6" | "ISO_IR 6" | "ISO 2022 IR 6" => Some(Default),
"ISO_IR_100" | "ISO_IR 100" | "ISO 2022 IR 100" => Some(IsoIr100),
"ISO_IR_101" | "ISO_IR 101" | "ISO 2022 IR 101" => Some(IsoIr101),
"ISO_IR_109" | "ISO_IR 109" | "ISO 2022 IR 109" => Some(IsoIr109),
"ISO_IR_110" | "ISO_IR 110" | "ISO 2022 IR 110" => Some(IsoIr110),
"ISO_IR_144" | "ISO_IR 144" | "ISO 2022 IR 144" => Some(IsoIr144),
"ISO_IR_192" | "ISO_IR 192" => Some(IsoIr192),
"GB18030" => Some(GB18030),
_ => None,
}
}
pub fn codec(self) -> Option<DynamicTextCodec> {
match self {
SpecificCharacterSet::Default => Some(Box::new(DefaultCharacterSetCodec)),
SpecificCharacterSet::IsoIr100 => Some(Box::new(IsoIr100CharacterSetCodec)),
SpecificCharacterSet::IsoIr101 => Some(Box::new(IsoIr101CharacterSetCodec)),
SpecificCharacterSet::IsoIr109 => Some(Box::new(IsoIr109CharacterSetCodec)),
SpecificCharacterSet::IsoIr110 => Some(Box::new(IsoIr110CharacterSetCodec)),
SpecificCharacterSet::IsoIr144 => Some(Box::new(IsoIr144CharacterSetCodec)),
SpecificCharacterSet::IsoIr192 => Some(Box::new(Utf8CharacterSetCodec)),
SpecificCharacterSet::GB18030 => Some(Box::new(Gb18030CharacterSetCodec)),
}
}
}
/// Decoder trap which replaces undecodable bytes with octal escape sequences.
///
/// Every byte of the offending `input` is written to `output` as a backslash
/// followed by its three-digit octal value (for example, `0xFF` becomes
/// `\377`). All bytes are escaped, so no part of a multi-byte invalid
/// sequence is lost, and an empty `input` simply writes nothing.
///
/// The decoder itself is not consulted. Always returns `true`, which tells
/// the decoder to carry on after the invalid sequence.
fn decode_text_trap(
    _decoder: &mut dyn RawDecoder,
    input: &[u8],
    output: &mut dyn StringWriter,
) -> bool {
    for &byte in input {
        output.write_char('\\');
        // Three octal digits: bits 7-6, bits 5-3, bits 2-0.
        output.write_char(char::from(b'0' + (byte >> 6)));
        output.write_char(char::from(b'0' + ((byte >> 3) & 7)));
        output.write_char(char::from(b'0' + (byte & 7)));
    }
    true
}
// Declares a unit struct `$typ` and implements `TextCodec` for it.
//
// - `$typ`:  name of the codec type to declare
// - `$term`: character set name, returned by `name()` and used in the docs
// - `$val`:  the `encoding` crate codec that does the actual work
//
// Decoding escapes invalid sequences through `decode_text_trap`;
// encoding is strict and fails on unrepresentable characters.
macro_rules! decl_character_set {
    ($typ: ident, $term: literal, $val: expr) => {
        #[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
        #[doc = "Data type for the "]
        #[doc = $term]
        #[doc = "character set encoding."]
        pub struct $typ;

        impl TextCodec for $typ {
            fn name(&self) -> &'static str {
                $term
            }

            fn decode(&self, text: &[u8]) -> DecodeResult<String> {
                match $val.decode(text, DecoderTrap::Call(decode_text_trap)) {
                    Ok(decoded) => Ok(decoded),
                    Err(message) => Err(DecodeCustom { message }.build()),
                }
            }

            fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
                match $val.encode(text, EncoderTrap::Strict) {
                    Ok(encoded) => Ok(encoded),
                    Err(message) => Err(EncodeCustom { message }.build()),
                }
            }
        }
    };
}
/// Data type representing the default character set.
///
/// Its codec reports the name `ISO_IR 6` and is implemented
/// on top of the ISO 8859-1 encoding.
#[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
pub struct DefaultCharacterSetCodec;
/// Codec for the default character set, backed by the ISO 8859-1 encoding.
impl TextCodec for DefaultCharacterSetCodec {
    fn name(&self) -> &'static str {
        "ISO_IR 6"
    }

    /// Decode `text`, escaping invalid sequences through `decode_text_trap`.
    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
        let decoded = ISO_8859_1.decode(text, DecoderTrap::Call(decode_text_trap));
        decoded.map_err(|message| DecodeCustom { message }.build())
    }

    /// Encode `text` strictly: unrepresentable characters are an error.
    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
        let encoded = ISO_8859_1.encode(text, EncoderTrap::Strict);
        encoded.map_err(|message| EncodeCustom { message }.build())
    }
}
// Codec types for the remaining character sets. Each invocation declares a
// unit struct plus its `TextCodec` implementation, given as
// (type name, character set name, underlying `encoding` codec).
decl_character_set!(IsoIr100CharacterSetCodec, "ISO_IR 100", ISO_8859_1);
decl_character_set!(IsoIr101CharacterSetCodec, "ISO_IR 101", ISO_8859_2);
decl_character_set!(IsoIr109CharacterSetCodec, "ISO_IR 109", ISO_8859_3);
decl_character_set!(IsoIr110CharacterSetCodec, "ISO_IR 110", ISO_8859_4);
decl_character_set!(IsoIr144CharacterSetCodec, "ISO_IR 144", ISO_8859_5);
decl_character_set!(Utf8CharacterSetCodec, "ISO_IR 192", UTF_8);
decl_character_set!(Gb18030CharacterSetCodec, "GB18030", GB18030);
/// The result of a text validation procedure (see the `validate_*` functions).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum TextValidationOutcome {
/// The text is fully valid.
Ok,
/// Strict decoding failed, but the text could still be decoded by
/// escaping the offending bytes. Only `validate_iso_8859` returns this.
BadCharacters,
/// The text is not valid.
NotOk,
}
/// Check whether the given byte slice can be decoded as ISO 8859-1 text.
///
/// - `Ok` when strict decoding succeeds;
/// - `BadCharacters` when strict decoding fails but decoding with
///   the escaping trap (`decode_text_trap`) succeeds;
/// - `NotOk` when both attempts fail.
pub fn validate_iso_8859(text: &[u8]) -> TextValidationOutcome {
    if ISO_8859_1.decode(text, DecoderTrap::Strict).is_ok() {
        return TextValidationOutcome::Ok;
    }

    // Strict decoding failed: retry while escaping the invalid sequences.
    let lenient = ISO_8859_1.decode(text, DecoderTrap::Call(decode_text_trap));
    if lenient.is_ok() {
        TextValidationOutcome::BadCharacters
    } else {
        TextValidationOutcome::NotOk
    }
}
/// Check whether the given byte slice contains only characters accepted
/// in a DA value: the ASCII digits `0` to `9`.
///
/// An empty slice is considered valid.
pub fn validate_da(text: &[u8]) -> TextValidationOutcome {
    let has_invalid_byte = text.iter().any(|byte| !byte.is_ascii_digit());
    if has_invalid_byte {
        TextValidationOutcome::NotOk
    } else {
        TextValidationOutcome::Ok
    }
}
/// Check whether the given byte slice contains only characters accepted
/// in a TM value: ASCII digits, backslash, `.`, `-` and space.
///
/// An empty slice is considered valid.
pub fn validate_tm(text: &[u8]) -> TextValidationOutcome {
    // Non-digit bytes which are also accepted.
    const EXTRA: &[u8] = b"\\.- ";

    let is_allowed = |byte: &u8| byte.is_ascii_digit() || EXTRA.contains(byte);
    if text.iter().all(is_allowed) {
        TextValidationOutcome::Ok
    } else {
        TextValidationOutcome::NotOk
    }
}
/// Check whether the given byte slice contains only characters accepted
/// in a DT value: ASCII digits, `.`, `-`, `+`, space and backslash.
///
/// An empty slice is considered valid.
pub fn validate_dt(text: &[u8]) -> TextValidationOutcome {
    for byte in text {
        let allowed = match *byte {
            b'0'..=b'9' => true,
            b'.' | b'-' | b'+' | b' ' | b'\\' => true,
            _ => false,
        };
        if !allowed {
            return TextValidationOutcome::NotOk;
        }
    }
    TextValidationOutcome::Ok
}
/// Check whether the given byte slice contains only characters accepted
/// in a CS value: ASCII digits, uppercase ASCII letters, space
/// and underscore.
///
/// An empty slice is considered valid.
pub fn validate_cs(text: &[u8]) -> TextValidationOutcome {
    let is_allowed = |byte: &u8| {
        byte.is_ascii_digit() || byte.is_ascii_uppercase() || *byte == b' ' || *byte == b'_'
    };
    match text.iter().all(is_allowed) {
        true => TextValidationOutcome::Ok,
        false => TextValidationOutcome::NotOk,
    }
}
// Baseline round-trip tests for the codecs of each character set.
#[cfg(test)]
mod tests {
use super::*;
/// Assert that `codec` encodes `string` into exactly `bytes`,
/// and decodes `bytes` back into exactly `string`.
fn test_codec<T>(codec: T, string: &str, bytes: &[u8])
where
T: TextCodec,
{
assert_eq!(codec.encode(string).expect("encoding"), bytes);
assert_eq!(codec.decode(bytes).expect("decoding"), string);
}
/// The default character set round-trips plain ASCII text.
#[test]
fn iso_ir_6_baseline() {
let codec = SpecificCharacterSet::Default
.codec()
.expect("Must be fully supported");
test_codec(codec, "Smith^John", b"Smith^John");
}
/// ISO_IR 192 round-trips text as its UTF-8 bytes.
#[test]
fn iso_ir_192_baseline() {
let codec = SpecificCharacterSet::IsoIr192
.codec()
.expect("Should be fully supported");
// the codec is passed by reference first so that it can be reused below
test_codec(&codec, "Simões^John", "Simões^John".as_bytes());
test_codec(codec, "Иванков^Андрей", "Иванков^Андрей".as_bytes());
}
/// ISO_IR 100 maps accented Latin letters to single bytes.
#[test]
fn iso_ir_100_baseline() {
let codec = SpecificCharacterSet::IsoIr100
.codec()
.expect("Should be fully supported");
test_codec(&codec, "Simões^João", b"Sim\xF5es^Jo\xE3o");
test_codec(codec, "Günther^Hans", b"G\xfcnther^Hans");
}
/// ISO_IR 101 maps `ü` to the single byte 0xFC.
#[test]
fn iso_ir_101_baseline() {
let codec = SpecificCharacterSet::IsoIr101
.codec()
.expect("Should be fully supported");
test_codec(codec, "Günther^Hans", b"G\xfcnther^Hans");
}
/// ISO_IR 144 maps Cyrillic letters to single bytes.
#[test]
fn iso_ir_144_baseline() {
let codec = SpecificCharacterSet::IsoIr144
.codec()
.expect("Should be fully supported");
test_codec(
codec,
"Иванков^Андрей",
b"\xb8\xd2\xd0\xdd\xda\xde\xd2^\xb0\xdd\xd4\xe0\xd5\xd9",
);
}
}