mod iso_8859;
mod shift_jis;
mod ucs4;
mod us_ascii;
mod utf16;
use std::{
borrow::Cow,
collections::BTreeMap,
str::{from_utf8, from_utf8_unchecked},
sync::{LazyLock, RwLock},
};
pub use crate::encoding::{iso_8859::*, shift_jis::*, ucs4::*, us_ascii::*, utf16::*};
/// Converts Rust string data into the bytes of one particular character encoding.
///
/// Boxed implementations are produced by [`EncoderFactory`] functions and looked up
/// by name through [`find_encoder`].
pub trait Encoder {
/// Returns the name this encoder reports for its encoding (the UTF-8
/// implementation in this file returns [`UTF8_NAME`]).
fn name(&self) -> &'static str;
/// Encodes text from `src` into the byte buffer `dst`.
///
/// On success a pair of counts is returned; the UTF-8 implementation in this file
/// returns the number of bytes it copied in both positions.
///
/// NOTE(review): presumably the pair is `(bytes read from src, bytes written to dst)`
/// and `finish` marks the last chunk of input — confirm against the encoders in the
/// submodules.
///
/// # Errors
///
/// Returns an [`EncodeError`] describing why the conversion could not proceed.
fn encode(
&mut self,
src: &str,
dst: &mut [u8],
finish: bool,
) -> Result<(usize, usize), EncodeError>;
}
/// Converts bytes in one particular character encoding into Rust string data.
///
/// Boxed implementations are produced by [`DecoderFactory`] functions and looked up
/// by name through [`find_decoder`].
pub trait Decoder {
/// Returns the name this decoder reports for its encoding (the UTF-8
/// implementation in this file returns [`UTF8_NAME`]).
fn name(&self) -> &'static str;
/// Decodes bytes from `src` and appends the resulting text to `dst`.
///
/// On success a pair of counts is returned; the UTF-8 implementation in this file
/// returns the number of bytes it consumed in both positions, since the bytes it
/// appends are the bytes it read.
///
/// NOTE(review): presumably the pair is `(bytes read from src, bytes written to dst)`
/// and `finish` marks the last chunk of input — confirm against the decoders in the
/// submodules.
///
/// # Errors
///
/// Returns a [`DecodeError`] describing why the conversion could not proceed.
fn decode(
&mut self,
src: &[u8],
dst: &mut String,
finish: bool,
) -> Result<(usize, usize), DecodeError>;
}
/// Failure modes reported by [`Encoder::encode`].
#[derive(Debug, Clone)]
pub enum EncodeError {
/// The source string was empty (the UTF-8 encoder reports this for an empty `src`
/// when `finish` is `false`).
InputIsEmpty,
/// The destination buffer cannot hold the output (for example, the UTF-8 encoder
/// reports this when `finish` is set and `src` is longer than `dst`).
OutputTooShort,
/// NOTE(review): never constructed in this file. Presumably `c` is a character with
/// no representation in the target encoding, and `read` / `write` give the progress
/// made when it was hit — confirm against the encoders in the submodules.
Unmappable { read: usize, write: usize, c: char },
/// Any other failure, described by the message in `msg`.
Other { msg: Cow<'static, str> },
}
/// Failure modes reported by [`Decoder::decode`].
#[derive(Debug, Clone)]
pub enum DecodeError {
/// The source slice was empty.
InputIsEmpty,
/// The destination has too little spare room (the UTF-8 decoder reports this when
/// fewer than 4 bytes of spare capacity remain in the destination `String`).
OutputTooShort,
/// The input contained a byte sequence that is not valid in the encoding.
///
/// As filled in by the UTF-8 decoder in this file:
/// * `read` — bytes consumed from the source, including the malformed sequence;
/// * `write` — bytes appended to the destination before the malformed sequence;
/// * `length` — byte length of the malformed sequence;
/// * `offset` — always `0` there. NOTE(review): the meaning of a non-zero `offset`
///   is not visible from this file — confirm against the other decoders.
Malformed {
read: usize,
write: usize,
length: usize,
offset: usize,
},
/// Any other failure, described by the message in `msg`.
Other { msg: Cow<'static, str> },
}
/// Name under which the UTF-8 encoder and decoder are registered, and the value
/// their `name()` methods return.
pub const UTF8_NAME: &str = "UTF-8";
/// Encoder for UTF-8.
///
/// Rust strings are already UTF-8, so encoding is a plain byte copy that takes care
/// never to cut a multi-byte character in half.
pub struct UTF8Encoder;

impl Encoder for UTF8Encoder {
    fn name(&self) -> &'static str {
        UTF8_NAME
    }

    /// Copies as much of `src` into `dst` as fits without splitting a character.
    ///
    /// Returns `(read, written)`; both counts are equal and always end on a character
    /// boundary of `src`, so the caller can safely continue from `&src[read..]`.
    ///
    /// # Errors
    ///
    /// * [`EncodeError::InputIsEmpty`] — `src` is empty and `finish` is `false`
    ///   (an empty `src` with `finish` set yields `Ok((0, 0))`).
    /// * [`EncodeError::OutputTooShort`] — `finish` is set but `src` does not fit in
    ///   `dst` as a whole, or `dst` cannot hold even the first character of `src`.
    fn encode(
        &mut self,
        src: &str,
        dst: &mut [u8],
        finish: bool,
    ) -> Result<(usize, usize), EncodeError> {
        if src.is_empty() {
            return if finish {
                Ok((0, 0))
            } else {
                Err(EncodeError::InputIsEmpty)
            };
        }
        if finish && src.len() > dst.len() {
            return Err(EncodeError::OutputTooShort);
        }

        // Largest prefix that fits in `dst`, shrunk until it ends on a character
        // boundary. Index 0 is always a boundary, so the loop terminates.
        let mut len = src.len().min(dst.len());
        while !src.is_char_boundary(len) {
            len -= 1;
        }
        if len == 0 {
            // Not even one character fits; reporting `Ok((0, 0))` would let the
            // caller spin without ever making progress.
            return Err(EncodeError::OutputTooShort);
        }

        dst[..len].copy_from_slice(&src.as_bytes()[..len]);
        Ok((len, len))
    }
}
/// Decoder for UTF-8.
///
/// Validates the input and appends the valid part to the destination string without
/// ever growing it beyond its current capacity.
pub struct UTF8Decoder;

impl Decoder for UTF8Decoder {
    fn name(&self) -> &'static str {
        UTF8_NAME
    }

    /// Validates a window of `src` and appends the valid prefix to `dst`.
    ///
    /// The window is limited to the spare capacity of `dst`. Returns `(read, written)`,
    /// which are equal because the appended bytes are exactly the consumed bytes. An
    /// incomplete multi-byte sequence at the end of the window is left unconsumed
    /// unless it is the true end of the input and `finish` is set.
    ///
    /// # Errors
    ///
    /// * [`DecodeError::InputIsEmpty`] — `src` is empty.
    /// * [`DecodeError::OutputTooShort`] — `dst` has fewer than 4 bytes of spare
    ///   capacity.
    /// * [`DecodeError::Malformed`] — an invalid sequence was found, or the input ends
    ///   in an incomplete sequence while `finish` is set. The valid prefix has already
    ///   been appended to `dst` when this is returned.
    fn decode(
        &mut self,
        src: &[u8],
        dst: &mut String,
        finish: bool,
    ) -> Result<(usize, usize), DecodeError> {
        if src.is_empty() {
            return Err(DecodeError::InputIsEmpty);
        }
        let room = dst.capacity() - dst.len();
        if room < 4 {
            return Err(DecodeError::OutputTooShort);
        }

        let len = room.min(src.len());
        // When the window is cut short by the capacity limit, an incomplete sequence
        // at its end may simply be a valid character split by the cut.
        let truncated = len < src.len();

        match from_utf8(&src[..len]) {
            Ok(text) => {
                dst.push_str(text);
                Ok((len, len))
            }
            Err(err) => {
                let up_to = err.valid_up_to();
                // SAFETY: `Utf8Error::valid_up_to` guarantees that `src[..up_to]` is
                // valid UTF-8.
                dst.push_str(unsafe { from_utf8_unchecked(&src[..up_to]) });
                match err.error_len() {
                    Some(bad) => Err(DecodeError::Malformed {
                        read: up_to + bad,
                        write: up_to,
                        length: bad,
                        offset: 0,
                    }),
                    // The input really ends here, in the middle of a character.
                    None if finish && !truncated => Err(DecodeError::Malformed {
                        read: len,
                        write: up_to,
                        length: len - up_to,
                        offset: 0,
                    }),
                    // More bytes may follow: consume only the valid prefix.
                    None => Ok((up_to, up_to)),
                }
            }
        }
    }
}
/// Names of the encodings listed as supported by default.
///
/// The list is kept in ascending byte-wise order, and that ordering is verified while
/// the constant is evaluated: a misplaced entry is a compile-time error. Two adjacent
/// identical names are accepted by the check.
pub const DEFAULT_SUPPORTED_ENCODINGS: &[&str] = {
    const NAMES: &[&str] = &[
        ISO_8859_10_NAME,
        ISO_8859_13_NAME,
        ISO_8859_14_NAME,
        ISO_8859_15_NAME,
        ISO_8859_16_NAME,
        ISO_8859_1_NAME,
        ISO_8859_2_NAME,
        ISO_8859_3_NAME,
        ISO_8859_4_NAME,
        ISO_8859_5_NAME,
        ISO_8859_6_NAME,
        ISO_8859_7_NAME,
        ISO_8859_8_NAME,
        ISO_8859_9_NAME,
        SHIFT_JIS_NAME,
        ISO_8859_11_NAME,
        US_ASCII_NAME,
        UTF16_NAME,
        UTF16BE_NAME,
        UTF16LE_NAME,
        UTF32_NAME,
        UTF32BE_NAME,
        UTF32LE_NAME,
        UTF8_NAME,
    ];

    // Compare every entry with its predecessor, byte by byte. Only `while` loops are
    // used because this runs in a const context.
    let mut idx = 1;
    while idx < NAMES.len() {
        let prev = NAMES[idx - 1].as_bytes();
        let next = NAMES[idx].as_bytes();
        let mut pos = 0;
        while pos < prev.len() {
            // `next` ended first: it is a proper prefix of `prev`, so it sorts before it.
            assert!(pos < next.len());
            // First differing byte must not be larger in `prev`.
            assert!(prev[pos] <= next[pos]);
            if prev[pos] != next[pos] {
                break;
            }
            pos += 1;
        }
        idx += 1;
    }
    NAMES
};
/// Process-wide table mapping alias spellings to the encoding name they stand for.
///
/// Every built-in key is written in upper case; [`resolve_encoding_alias`] falls back
/// to an ASCII-uppercased lookup, and [`register_encoding_alias`] uppercases the keys
/// it inserts. The table is built lazily on first access.
pub static ENCODING_ALIASES: LazyLock<RwLock<BTreeMap<Cow<'static, str>, &'static str>>> =
    LazyLock::new(|| {
        // (alias, encoding name) pairs shipped with the library.
        const BUILTIN: &[(&str, &str)] = &[
            // UTF-8 / UTF-16
            ("UTF8", UTF8_NAME),
            ("UTF16", UTF16_NAME),
            ("UTF16BE", UTF16BE_NAME),
            ("UTF16LE", UTF16LE_NAME),
            // ISO 8859-1
            ("ISO-IR-100", ISO_8859_1_NAME),
            ("ISO_8859-1", ISO_8859_1_NAME),
            ("ISO-8859-1", ISO_8859_1_NAME),
            ("LATIN1", ISO_8859_1_NAME),
            ("L1", ISO_8859_1_NAME),
            ("IBM819", ISO_8859_1_NAME),
            ("CP819", ISO_8859_1_NAME),
            ("ISOLATIN1", ISO_8859_1_NAME),
            // ISO 8859-2
            ("ISO-IR-101", ISO_8859_2_NAME),
            ("ISO_8859-2", ISO_8859_2_NAME),
            ("ISO-8859-2", ISO_8859_2_NAME),
            ("LATIN2", ISO_8859_2_NAME),
            ("L2", ISO_8859_2_NAME),
            ("ISOLATIN2", ISO_8859_2_NAME),
            // ISO 8859-3
            ("ISO-IR-109", ISO_8859_3_NAME),
            ("ISO_8859-3", ISO_8859_3_NAME),
            ("ISO-8859-3", ISO_8859_3_NAME),
            ("LATIN3", ISO_8859_3_NAME),
            ("L3", ISO_8859_3_NAME),
            ("ISOLATIN3", ISO_8859_3_NAME),
            // ISO 8859-4
            ("ISO-IR-110", ISO_8859_4_NAME),
            ("ISO_8859-4", ISO_8859_4_NAME),
            ("ISO-8859-4", ISO_8859_4_NAME),
            ("LATIN4", ISO_8859_4_NAME),
            ("L4", ISO_8859_4_NAME),
            ("ISOLATIN4", ISO_8859_4_NAME),
            // ISO 8859-5
            ("ISO-IR-144", ISO_8859_5_NAME),
            ("ISO_8859-5", ISO_8859_5_NAME),
            ("ISO-8859-5", ISO_8859_5_NAME),
            ("CYRILLIC", ISO_8859_5_NAME),
            ("ISOLATINCYRILLIC", ISO_8859_5_NAME),
            // ISO 8859-6
            ("ISO-IR-127", ISO_8859_6_NAME),
            ("ISO_8859-6", ISO_8859_6_NAME),
            ("ISO-8859-6", ISO_8859_6_NAME),
            ("ECMA-114", ISO_8859_6_NAME),
            ("ASMO-708", ISO_8859_6_NAME),
            ("ARABIC", ISO_8859_6_NAME),
            ("ISOLATINARABIC", ISO_8859_6_NAME),
            // ISO 8859-7
            ("ISO-IR-126", ISO_8859_7_NAME),
            ("ISO_8859-7", ISO_8859_7_NAME),
            ("ISO-8859-7", ISO_8859_7_NAME),
            ("ELOT_928", ISO_8859_7_NAME),
            ("ECMA-118", ISO_8859_7_NAME),
            ("GREEK", ISO_8859_7_NAME),
            ("GREEK8", ISO_8859_7_NAME),
            ("ISOLATINGREEK", ISO_8859_7_NAME),
            // ISO 8859-8
            ("ISO-IR-138", ISO_8859_8_NAME),
            ("ISO_8859-8", ISO_8859_8_NAME),
            ("ISO-8859-8", ISO_8859_8_NAME),
            ("HEBREW", ISO_8859_8_NAME),
            ("ISOLATINHEBREW", ISO_8859_8_NAME),
            // ISO 8859-9
            ("ISO-IR-148", ISO_8859_9_NAME),
            ("ISO_8859-9", ISO_8859_9_NAME),
            ("ISO-8859-9", ISO_8859_9_NAME),
            ("LATIN5", ISO_8859_9_NAME),
            ("L5", ISO_8859_9_NAME),
            ("ISOLATIN5", ISO_8859_9_NAME),
            // ISO 8859-10
            ("ISO-IR-157", ISO_8859_10_NAME),
            ("L6", ISO_8859_10_NAME),
            ("ISO_8859-10:1992", ISO_8859_10_NAME),
            ("ISOLATIN6", ISO_8859_10_NAME),
            ("LATIN6", ISO_8859_10_NAME),
            // ISO 8859-11
            ("TIS620", ISO_8859_11_NAME),
            ("ISO-8859-11", ISO_8859_11_NAME),
            // ISO 8859-13
            ("ISO885913", ISO_8859_13_NAME),
            // ISO 8859-14
            ("ISO-IR-199", ISO_8859_14_NAME),
            ("ISO_8859-14:1998", ISO_8859_14_NAME),
            ("ISO_8859-14", ISO_8859_14_NAME),
            ("LATIN8", ISO_8859_14_NAME),
            ("ISO-CELTIC", ISO_8859_14_NAME),
            ("L8", ISO_8859_14_NAME),
            ("ISO885914", ISO_8859_14_NAME),
            // ISO 8859-15
            ("ISO_8859-15", ISO_8859_15_NAME),
            ("LATIN-9", ISO_8859_15_NAME),
            ("ISO885915", ISO_8859_15_NAME),
            // ISO 8859-16
            ("ISO-IR-226", ISO_8859_16_NAME),
            ("ISO_8859-16:2001", ISO_8859_16_NAME),
            ("ISO_8859-16", ISO_8859_16_NAME),
            ("LATIN10", ISO_8859_16_NAME),
            ("L10", ISO_8859_16_NAME),
            ("ISO885916", ISO_8859_16_NAME),
            // UTF-32
            ("UTF32", UTF32_NAME),
            ("UTF32BE", UTF32BE_NAME),
            ("UTF32LE", UTF32LE_NAME),
            // Shift_JIS
            ("MS_KANJI", SHIFT_JIS_NAME),
            ("SHIFTJIS", SHIFT_JIS_NAME),
            // US-ASCII
            ("ISO-IR-6", US_ASCII_NAME),
            ("ANSI_X3.4-1968", US_ASCII_NAME),
            ("ANSI_X3.4-1986", US_ASCII_NAME),
            ("ISO_646.IRV:1991", US_ASCII_NAME),
            ("ISO646-US", US_ASCII_NAME),
            ("US-ASCII", US_ASCII_NAME),
            ("US", US_ASCII_NAME),
            ("IBM367", US_ASCII_NAME),
            ("CP367", US_ASCII_NAME),
            ("ASCII", US_ASCII_NAME),
        ];

        // Built-in keys are string literals, so they are stored borrowed.
        let table: BTreeMap<Cow<'static, str>, &'static str> = BUILTIN
            .iter()
            .map(|&(alias, name)| (Cow::Borrowed(alias), name))
            .collect();
        RwLock::new(table)
    });
pub fn register_encoding_alias(alias: &'static str, real: &'static str) -> Option<&'static str> {
let mut table = ENCODING_ALIASES.write().unwrap();
if alias.chars().all(|c| c.is_ascii_uppercase()) {
table.insert(alias.into(), real)
} else {
table.insert(alias.to_ascii_uppercase().into(), real)
}
}
/// Removes `alias` from [`ENCODING_ALIASES`].
///
/// The alias is ASCII-uppercased before the lookup, matching the way
/// [`register_encoding_alias`] stores its keys. Any string slice is accepted; the
/// alias does not have to be a `'static` string.
///
/// Returns the encoding name the alias was mapped to, or `None` if it was not
/// registered.
///
/// # Panics
///
/// Panics if the lock guarding [`ENCODING_ALIASES`] is poisoned.
pub fn unregister_encoding_alias(alias: &str) -> Option<&'static str> {
    let key = alias.to_ascii_uppercase();
    let mut table = ENCODING_ALIASES.write().unwrap();
    table.remove(key.as_str())
}
/// Looks up the encoding name registered for `alias` in [`ENCODING_ALIASES`].
///
/// The alias is tried exactly as given first and, if that misses, again in its
/// ASCII-uppercased form. Returns `None` when neither spelling is registered.
///
/// # Panics
///
/// Panics if the lock guarding [`ENCODING_ALIASES`] is poisoned.
pub fn resolve_encoding_alias(alias: &str) -> Option<&'static str> {
    let table = ENCODING_ALIASES.read().unwrap();
    if let Some(&name) = table.get(alias) {
        return Some(name);
    }
    let upper = alias.to_ascii_uppercase();
    table.get(upper.as_str()).copied()
}
/// Function pointer that builds a new boxed [`Encoder`]; the value type stored in
/// [`ENCODER_TABLE`].
pub type EncoderFactory = fn() -> Box<dyn Encoder>;
/// Process-wide registry mapping encoding names to the factories that build their
/// encoders.
///
/// It is created lazily with the built-in encoders below, read by [`find_encoder`],
/// and modified through [`register_encoder`] and [`unregister_encoder`].
pub static ENCODER_TABLE: LazyLock<RwLock<BTreeMap<&'static str, EncoderFactory>>> =
    LazyLock::new(|| {
        let mut factories: BTreeMap<&'static str, EncoderFactory> = BTreeMap::new();

        // Unicode transformation formats (8- and 16-bit).
        factories.insert(UTF8_NAME, || Box::new(UTF8Encoder));
        factories.insert(UTF16_NAME, || Box::new(UTF16Encoder::default()));
        factories.insert(UTF16BE_NAME, || Box::new(UTF16BEEncoder));
        factories.insert(UTF16LE_NAME, || Box::new(UTF16LEEncoder));

        // ISO 8859 family.
        factories.insert(ISO_8859_1_NAME, || Box::new(ISO8859_1Encoder));
        factories.insert(ISO_8859_2_NAME, || Box::new(ISO8859_2Encoder));
        factories.insert(ISO_8859_3_NAME, || Box::new(ISO8859_3Encoder));
        factories.insert(ISO_8859_4_NAME, || Box::new(ISO8859_4Encoder));
        factories.insert(ISO_8859_5_NAME, || Box::new(ISO8859_5Encoder));
        factories.insert(ISO_8859_6_NAME, || Box::new(ISO8859_6Encoder));
        factories.insert(ISO_8859_7_NAME, || Box::new(ISO8859_7Encoder));
        factories.insert(ISO_8859_8_NAME, || Box::new(ISO8859_8Encoder));
        factories.insert(ISO_8859_9_NAME, || Box::new(ISO8859_9Encoder));
        factories.insert(ISO_8859_10_NAME, || Box::new(ISO8859_10Encoder));
        factories.insert(ISO_8859_11_NAME, || Box::new(ISO8859_11Encoder));
        factories.insert(ISO_8859_13_NAME, || Box::new(ISO8859_13Encoder));
        factories.insert(ISO_8859_14_NAME, || Box::new(ISO8859_14Encoder));
        factories.insert(ISO_8859_15_NAME, || Box::new(ISO8859_15Encoder));
        factories.insert(ISO_8859_16_NAME, || Box::new(ISO8859_16Encoder));

        // UTF-32 variants.
        factories.insert(UTF32_NAME, || Box::new(UTF32Encoder::default()));
        factories.insert(UTF32BE_NAME, || Box::new(UTF32BEEncoder));
        factories.insert(UTF32LE_NAME, || Box::new(UTF32LEEncoder));

        // Remaining built-ins.
        factories.insert(SHIFT_JIS_NAME, || Box::new(ShiftJISEncoder));
        factories.insert(US_ASCII_NAME, || Box::new(USASCIIEncoder));

        RwLock::new(factories)
    });
/// Creates an encoder for `encoding_name`, or returns `None` if none is registered.
///
/// [`ENCODER_TABLE`] is consulted with, in this order:
/// 1. the name exactly as given,
/// 2. the ASCII-uppercased name,
/// 3. the name that [`resolve_encoding_alias`] reports for it.
///
/// The read lock on the table is held while the factory runs.
///
/// # Panics
///
/// Panics if the lock guarding [`ENCODER_TABLE`] is poisoned.
pub fn find_encoder(encoding_name: &str) -> Option<Box<dyn Encoder>> {
    let table = ENCODER_TABLE.read().unwrap();
    let factory = table
        .get(encoding_name)
        .or_else(|| table.get(encoding_name.to_ascii_uppercase().as_str()))
        .or_else(|| resolve_encoding_alias(encoding_name).and_then(|name| table.get(name)))?;
    Some(factory())
}
/// Registers `factory` in [`ENCODER_TABLE`] under `encoding_name`.
///
/// The name is used as the key exactly as given. Returns the factory that was
/// previously registered under that name, if any.
///
/// # Panics
///
/// Panics if the lock guarding [`ENCODER_TABLE`] is poisoned.
pub fn register_encoder(
    encoding_name: &'static str,
    factory: EncoderFactory,
) -> Option<EncoderFactory> {
    let mut registry = ENCODER_TABLE.write().unwrap();
    registry.insert(encoding_name, factory)
}
/// Removes the factory registered in [`ENCODER_TABLE`] under `encoding_name`.
///
/// The name must match the registered key exactly. Returns the removed factory, or
/// `None` if nothing was registered under that name.
///
/// # Panics
///
/// Panics if the lock guarding [`ENCODER_TABLE`] is poisoned.
pub fn unregister_encoder(encoding_name: &str) -> Option<EncoderFactory> {
    let mut registry = ENCODER_TABLE.write().unwrap();
    registry.remove(encoding_name)
}
/// Function pointer that builds a new boxed [`Decoder`]; the value type stored in
/// [`DECODER_TABLE`].
pub type DecoderFactory = fn() -> Box<dyn Decoder>;
/// Process-wide registry mapping encoding names to the factories that build their
/// decoders.
///
/// It is created lazily with the built-in decoders below, read by [`find_decoder`],
/// and modified through [`register_decoder`] and [`unregister_decoder`].
pub static DECODER_TABLE: LazyLock<RwLock<BTreeMap<&'static str, DecoderFactory>>> =
    LazyLock::new(|| {
        let mut factories: BTreeMap<&'static str, DecoderFactory> = BTreeMap::new();

        // Unicode transformation formats (8- and 16-bit).
        factories.insert(UTF8_NAME, || Box::new(UTF8Decoder));
        factories.insert(UTF16_NAME, || Box::new(UTF16Decoder::default()));
        factories.insert(UTF16BE_NAME, || Box::new(UTF16BEDecoder));
        factories.insert(UTF16LE_NAME, || Box::new(UTF16LEDecoder));

        // ISO 8859 family.
        factories.insert(ISO_8859_1_NAME, || Box::new(ISO8859_1Decoder));
        factories.insert(ISO_8859_2_NAME, || Box::new(ISO8859_2Decoder));
        factories.insert(ISO_8859_3_NAME, || Box::new(ISO8859_3Decoder));
        factories.insert(ISO_8859_4_NAME, || Box::new(ISO8859_4Decoder));
        factories.insert(ISO_8859_5_NAME, || Box::new(ISO8859_5Decoder));
        factories.insert(ISO_8859_6_NAME, || Box::new(ISO8859_6Decoder));
        factories.insert(ISO_8859_7_NAME, || Box::new(ISO8859_7Decoder));
        factories.insert(ISO_8859_8_NAME, || Box::new(ISO8859_8Decoder));
        factories.insert(ISO_8859_9_NAME, || Box::new(ISO8859_9Decoder));
        factories.insert(ISO_8859_10_NAME, || Box::new(ISO8859_10Decoder));
        factories.insert(ISO_8859_11_NAME, || Box::new(ISO8859_11Decoder));
        factories.insert(ISO_8859_13_NAME, || Box::new(ISO8859_13Decoder));
        factories.insert(ISO_8859_14_NAME, || Box::new(ISO8859_14Decoder));
        factories.insert(ISO_8859_15_NAME, || Box::new(ISO8859_15Decoder));
        factories.insert(ISO_8859_16_NAME, || Box::new(ISO8859_16Decoder));

        // UTF-32 variants.
        factories.insert(UTF32_NAME, || Box::new(UTF32Decoder::default()));
        factories.insert(UTF32BE_NAME, || Box::new(UTF32BEDecoder));
        factories.insert(UTF32LE_NAME, || Box::new(UTF32LEDecoder));

        // Remaining built-ins.
        factories.insert(SHIFT_JIS_NAME, || Box::new(ShiftJISDecoder));
        factories.insert(US_ASCII_NAME, || Box::new(USASCIIDecoder));

        RwLock::new(factories)
    });
/// Creates a decoder for `encoding_name`, or returns `None` if none is registered.
///
/// [`DECODER_TABLE`] is consulted with, in this order:
/// 1. the name exactly as given,
/// 2. the ASCII-uppercased name,
/// 3. the name that [`resolve_encoding_alias`] reports for it.
///
/// The read lock on the table is held while the factory runs.
///
/// # Panics
///
/// Panics if the lock guarding [`DECODER_TABLE`] is poisoned.
pub fn find_decoder(encoding_name: &str) -> Option<Box<dyn Decoder>> {
    let table = DECODER_TABLE.read().unwrap();
    let factory = table
        .get(encoding_name)
        .or_else(|| table.get(encoding_name.to_ascii_uppercase().as_str()))
        .or_else(|| resolve_encoding_alias(encoding_name).and_then(|name| table.get(name)))?;
    Some(factory())
}
/// Registers `factory` in [`DECODER_TABLE`] under `encoding_name`.
///
/// The name is used as the key exactly as given. Returns the factory that was
/// previously registered under that name, if any.
///
/// # Panics
///
/// Panics if the lock guarding [`DECODER_TABLE`] is poisoned.
pub fn register_decoder(
    encoding_name: &'static str,
    factory: DecoderFactory,
) -> Option<DecoderFactory> {
    let mut registry = DECODER_TABLE.write().unwrap();
    registry.insert(encoding_name, factory)
}
/// Removes the factory registered in [`DECODER_TABLE`] under `encoding_name`.
///
/// The name must match the registered key exactly. Returns the removed factory, or
/// `None` if nothing was registered under that name.
///
/// # Panics
///
/// Panics if the lock guarding [`DECODER_TABLE`] is poisoned.
pub fn unregister_decoder(encoding_name: &str) -> Option<DecoderFactory> {
    let mut registry = DECODER_TABLE.write().unwrap();
    registry.remove(encoding_name)
}