use std::{
borrow::Cow,
collections::BTreeMap,
iter::once,
str::{from_utf8, from_utf8_unchecked},
sync::{LazyLock, RwLock},
};
/// Incremental encoder that converts `str` input into the bytes of one text encoding.
pub trait Encoder {
/// Returns the name of the encoding this encoder produces.
fn name(&self) -> &'static str;
/// Encodes a prefix of `src` into `dst`.
///
/// The implementations in this file return `Ok((read, write))`, where `read` is the number
/// of bytes of `src` that were consumed and `write` is the number of bytes stored in `dst`.
///
/// `finish` presumably marks `src` as the last chunk of input (TODO confirm with callers);
/// the UTF-16BE/UTF-16LE encoders in this file ignore it.
fn encode(
&mut self,
src: &str,
dst: &mut [u8],
finish: bool,
) -> Result<(usize, usize), EncodeError>;
}
/// Incremental decoder that converts the bytes of one text encoding into a `String`.
pub trait Decoder {
/// Returns the name of the encoding this decoder consumes.
fn name(&self) -> &'static str;
/// Decodes a prefix of `src` and appends the resulting text to `dst`.
///
/// The implementations in this file return `Ok((read, write))`, where `read` is the number
/// of bytes of `src` that were consumed and `write` is the number of UTF-8 bytes appended
/// to `dst`. They fail with `DecodeError::OutputTooShort` unless `dst` has at least four
/// bytes of spare capacity (`capacity() - len()`).
///
/// `finish` presumably marks `src` as the last chunk of input (TODO confirm with callers).
/// While it is `false`, the implementations in this file leave an incomplete trailing
/// sequence unread instead of reporting it as malformed.
fn decode(
&mut self,
src: &[u8],
dst: &mut String,
finish: bool,
) -> Result<(usize, usize), DecodeError>;
}
/// Errors returned by `Encoder::encode`.
#[derive(Debug)]
pub enum EncodeError {
/// The input string was empty.
InputIsEmpty,
/// The output buffer is too small for the encoder to make progress.
OutputTooShort,
/// Presumably reports a character `c` that has no representation in the target encoding,
/// with `read`/`write` as the byte counts processed before it. No encoder visible in this
/// file constructs this variant (TODO confirm against other encoders).
Unmappable { read: usize, write: usize, c: char },
/// Any other failure, described by `msg`. Not constructed by the code visible in this file.
Other { msg: Cow<'static, str> },
}
/// Errors returned by `Decoder::decode`.
#[derive(Debug)]
pub enum DecodeError {
/// The input slice was empty.
InputIsEmpty,
/// The output string has fewer than four bytes of spare capacity (as checked by the
/// decoders in this file).
OutputTooShort,
/// The input contains a byte sequence that is not valid in the source encoding.
Malformed {
// Bytes of input consumed, as set by the decoders in this file: the valid prefix plus
// the malformed sequence.
read: usize,
// UTF-8 bytes appended to the output before the malformed sequence was hit.
write: usize,
// Length in bytes of the malformed sequence.
length: usize,
// Always 0 in the decoders visible in this file; meaning not evident here (TODO confirm).
offset: usize,
},
/// Any other failure, described by `msg`. Not constructed by the code visible in this file.
Other { msg: Cow<'static, str> },
}
/// Canonical name of the UTF-8 encoding; reported by the UTF-8 coders and used as their registry key.
pub const UTF8_NAME: &str = "UTF-8";
/// Encoder producing UTF-8 output.
///
/// `str` input is already UTF-8, so encoding is a plain byte copy.
pub struct UTF8Encoder;

impl Encoder for UTF8Encoder {
    fn name(&self) -> &'static str {
        UTF8_NAME
    }

    /// Copies as many whole characters of `src` as fit into `dst`.
    ///
    /// Returns `Ok((read, write))`; both values equal the number of bytes copied and `read`
    /// always lies on a character boundary of `src`, so a caller can resume at `&src[read..]`.
    /// An empty `src` yields `Ok((0, 0))` when `finish` is set.
    ///
    /// # Errors
    ///
    /// * `EncodeError::InputIsEmpty` if `src` is empty and `finish` is not set.
    /// * `EncodeError::OutputTooShort` if `finish` is set and `dst` cannot hold all of `src`,
    ///   or if `dst` cannot hold even the first character of `src`.
    fn encode(
        &mut self,
        src: &str,
        dst: &mut [u8],
        finish: bool,
    ) -> Result<(usize, usize), EncodeError> {
        if src.is_empty() {
            return if finish {
                Ok((0, 0))
            } else {
                Err(EncodeError::InputIsEmpty)
            };
        }
        if finish && src.len() > dst.len() {
            return Err(EncodeError::OutputTooShort);
        }
        let mut len = src.len().min(dst.len());
        // Never cut a multi-byte character in half: the output would end in an incomplete
        // sequence and `read` would not be usable as a slice index into `src`.
        // Index 0 is always a boundary, so the loop terminates.
        while !src.is_char_boundary(len) {
            len -= 1;
        }
        if len == 0 {
            // Not even one character fits; report it instead of a no-progress success.
            return Err(EncodeError::OutputTooShort);
        }
        dst[..len].copy_from_slice(&src.as_bytes()[..len]);
        Ok((len, len))
    }
}
/// Decoder consuming UTF-8 input.
pub struct UTF8Decoder;

impl Decoder for UTF8Decoder {
    fn name(&self) -> &'static str {
        UTF8_NAME
    }

    /// Validates a prefix of `src` as UTF-8 and appends it to `dst`.
    ///
    /// At most `dst.capacity() - dst.len()` bytes of `src` are examined per call, so `dst` is
    /// never reallocated. Returns `Ok((read, write))` with the number of bytes consumed from
    /// `src` and appended to `dst` (always equal for UTF-8).
    ///
    /// # Errors
    ///
    /// * `DecodeError::InputIsEmpty` if `src` is empty.
    /// * `DecodeError::OutputTooShort` if `dst` has fewer than four bytes of spare capacity.
    /// * `DecodeError::Malformed` if an invalid sequence is found, or if `finish` is set and
    ///   `src` itself ends in the middle of a character. The valid prefix has already been
    ///   appended to `dst` when this error is returned.
    fn decode(
        &mut self,
        src: &[u8],
        dst: &mut String,
        finish: bool,
    ) -> Result<(usize, usize), DecodeError> {
        if src.is_empty() {
            return Err(DecodeError::InputIsEmpty);
        }
        let room = dst.capacity() - dst.len();
        if room < 4 {
            return Err(DecodeError::OutputTooShort);
        }
        let len = room.min(src.len());
        let err = match from_utf8(&src[..len]) {
            Ok(s) => {
                dst.push_str(s);
                return Ok((len, len));
            }
            Err(err) => err,
        };
        let up_to = err.valid_up_to();
        // SAFETY: `Utf8Error::valid_up_to` guarantees that `src[..up_to]` is valid UTF-8.
        dst.push_str(unsafe { from_utf8_unchecked(&src[..up_to]) });
        match err.error_len() {
            Some(bad) => Err(DecodeError::Malformed {
                read: up_to + bad,
                write: up_to,
                length: bad,
                offset: 0,
            }),
            // An incomplete trailing sequence is only an error when the input really ends
            // there. If the window was cut short by the capacity limit, the rest of the
            // character is still waiting in `src` and must not be reported as malformed.
            None if finish && len == src.len() => Err(DecodeError::Malformed {
                read: len,
                write: up_to,
                length: len - up_to,
                offset: 0,
            }),
            None => Ok((up_to, up_to)),
        }
    }
}
/// Canonical name of the BOM-aware UTF-16 encoding; reported by `UTF16Encoder`/`UTF16Decoder` and used as their registry key.
pub const UTF16_NAME: &str = "UTF-16";
/// Encoder producing UTF-16 output that starts with a little-endian byte order mark.
pub struct UTF16Encoder {
    /// `true` once the byte order mark has been written.
    init: bool,
}

impl Encoder for UTF16Encoder {
    fn name(&self) -> &'static str {
        UTF16_NAME
    }

    /// Emits the byte order mark on the first successful call, then little-endian code units.
    ///
    /// The first call that passes the argument checks writes only `FF FE` and returns
    /// `Ok((0, 2))` without consuming any input. Every later call is forwarded to
    /// `UTF16LEEncoder`.
    ///
    /// # Errors
    ///
    /// * `EncodeError::InputIsEmpty` if `src` is empty.
    /// * `EncodeError::OutputTooShort` if `dst` is shorter than four bytes.
    fn encode(
        &mut self,
        src: &str,
        dst: &mut [u8],
        finish: bool,
    ) -> Result<(usize, usize), EncodeError> {
        if src.is_empty() {
            return Err(EncodeError::InputIsEmpty);
        }
        if dst.len() < 4 {
            return Err(EncodeError::OutputTooShort);
        }
        if self.init {
            return UTF16LEEncoder.encode(src, dst, finish);
        }
        // First call: announce little-endian byte order and nothing else.
        self.init = true;
        dst[..2].copy_from_slice(&[0xFF, 0xFE]);
        Ok((0, 2))
    }
}
/// Decoder for UTF-16 input whose byte order is selected by an optional byte order mark.
///
/// The first two bytes of the stream are buffered in `top`. `FF FE` selects little endian,
/// `FE FF` selects big endian, and any other pair is treated as the first big-endian code
/// unit of the text itself.
pub struct UTF16Decoder {
    /// Number of leading bytes captured in `top` so far (0 to 2).
    read: usize,
    /// The first two bytes of the stream. Overwritten with `FE FF` once a BOM-less first
    /// code unit has been consumed, which marks it as no longer pending.
    top: [u8; 2],
    /// Whether the payload is decoded as big endian.
    be: bool,
}

impl Decoder for UTF16Decoder {
    fn name(&self) -> &'static str {
        UTF16_NAME
    }

    /// Decodes UTF-16 from `src` and appends the text to `dst`.
    ///
    /// The call that completes the two-byte prefix returns `Ok((n, 0))` when the prefix is a
    /// byte order mark. If it is not, that prefix is kept as the first (big-endian) code unit
    /// and decoded together with the following input. Once the prefix has been dealt with,
    /// calls are forwarded to `UTF16BEDecoder` or `UTF16LEDecoder`.
    ///
    /// Returns `Ok((read, write))`: bytes consumed from this call's `src` and UTF-8 bytes
    /// appended to `dst`.
    ///
    /// # Errors
    ///
    /// * `DecodeError::InputIsEmpty` if `src` is empty.
    /// * `DecodeError::OutputTooShort` if `dst` has fewer than four bytes of spare capacity.
    /// * `DecodeError::Malformed` for an unpaired surrogate. `read` counts the bytes of this
    ///   call's `src` up to and including the bad code unit; when that unit was (partly)
    ///   buffered by an earlier call, `read` can therefore be smaller than `length`.
    fn decode(
        &mut self,
        mut src: &[u8],
        dst: &mut String,
        finish: bool,
    ) -> Result<(usize, usize), DecodeError> {
        if src.is_empty() {
            return Err(DecodeError::InputIsEmpty);
        }
        if dst.capacity() - dst.len() < 4 {
            return Err(DecodeError::OutputTooShort);
        }

        // Bytes of this call's `src` that were moved into `self.top`.
        let mut base = 0;
        if self.read < 2 {
            let orig = src.len();
            while self.read < 2 && !src.is_empty() {
                self.top[self.read] = src[0];
                src = &src[1..];
                self.read += 1;
            }
            base = orig - src.len();
            if self.read < 2 {
                return Ok((base, 0));
            }
            match self.top {
                [0xFF, 0xFE] => {
                    self.be = false;
                    return Ok((base, 0));
                }
                [0xFE, 0xFF] => {
                    self.be = true;
                    return Ok((base, 0));
                }
                // No byte order mark: the prefix is text, big endian by default.
                _ => self.be = true,
            }
        }

        if self.be && self.top != [0xFE, 0xFF] {
            // The BOM-less first code unit is still pending. Decode the virtual stream
            // `top ++ src`; `read` counts bytes of that virtual stream.
            let units = once(u16::from_be_bytes(self.top)).chain(
                src.chunks_exact(2)
                    .map(|v| u16::from_be_bytes([v[0], v[1]])),
            );
            let mut read = 0;
            let mut write = 0;
            for c in char::decode_utf16(units) {
                match c {
                    Ok(c) => {
                        read += c.len_utf16() * 2;
                        write += c.len_utf8();
                        dst.push(c);
                    }
                    Err(_) => {
                        // Bytes left in the virtual stream starting at the bad unit. Written
                        // as `len + 2 - read` so it cannot underflow when the very first
                        // unit is the bad one (`read == 0`).
                        let rem = src.len() + 2 - read;
                        if !finish && rem < 4 {
                            break;
                        }
                        // The pending unit is consumed either way: it was decoded earlier in
                        // this loop or it is the malformed unit being reported. Without this
                        // it would be replayed on the next call.
                        self.top = [0xFE, 0xFF];
                        // Virtual offset `read + 2`, minus the `2 - base` prefix bytes that
                        // did not come from this call's `src`.
                        return Err(DecodeError::Malformed {
                            read: read + base,
                            write,
                            length: 2,
                            offset: 0,
                        });
                    }
                }
                if dst.capacity() - dst.len() < 4 {
                    break;
                }
            }
            return if read > 0 {
                self.top = [0xFE, 0xFF];
                Ok((read - (2 - base), write))
            } else {
                Ok((base, 0))
            };
        }

        if self.be {
            UTF16BEDecoder.decode(src, dst, finish)
        } else {
            UTF16LEDecoder.decode(src, dst, finish)
        }
    }
}
/// Canonical name of big-endian UTF-16; reported by the UTF-16BE coders and used as their registry key.
pub const UTF16BE_NAME: &str = "UTF-16BE";
/// Encoder producing big-endian UTF-16 without a byte order mark.
pub struct UTF16BEEncoder;

impl Encoder for UTF16BEEncoder {
    fn name(&self) -> &'static str {
        UTF16BE_NAME
    }

    /// Encodes characters of `src` as big-endian UTF-16 code units.
    ///
    /// Stops as soon as fewer than four bytes of `dst` remain, which is the room a surrogate
    /// pair needs. Returns `Ok((read, write))`: UTF-8 bytes consumed from `src` and bytes
    /// stored in `dst`. The `finish` flag is not used.
    ///
    /// # Errors
    ///
    /// * `EncodeError::InputIsEmpty` if `src` is empty.
    /// * `EncodeError::OutputTooShort` if `dst` is shorter than four bytes.
    fn encode(
        &mut self,
        src: &str,
        dst: &mut [u8],
        _finish: bool,
    ) -> Result<(usize, usize), EncodeError> {
        if src.is_empty() {
            return Err(EncodeError::InputIsEmpty);
        }
        if dst.len() < 4 {
            return Err(EncodeError::OutputTooShort);
        }
        let mut units = [0u16; 2];
        let mut consumed = 0;
        let mut produced = 0;
        for ch in src.chars() {
            consumed += ch.len_utf8();
            // One unit for BMP characters, two (a surrogate pair) otherwise.
            for unit in ch.encode_utf16(&mut units).iter() {
                dst[produced..produced + 2].copy_from_slice(&unit.to_be_bytes());
                produced += 2;
            }
            if dst.len() - produced < 4 {
                break;
            }
        }
        Ok((consumed, produced))
    }
}
/// Decoder consuming big-endian UTF-16 without byte order mark handling.
pub struct UTF16BEDecoder;

impl Decoder for UTF16BEDecoder {
    fn name(&self) -> &'static str {
        UTF16BE_NAME
    }

    /// Decodes big-endian UTF-16 code units from `src` and appends the text to `dst`.
    ///
    /// Decoding stops once `dst` has fewer than four bytes of spare capacity, so `dst` is
    /// never reallocated. Returns `Ok((read, write))`: bytes consumed from `src` and UTF-8
    /// bytes appended to `dst`. While `finish` is not set, an unpaired surrogate within the
    /// last four bytes and a dangling odd byte are left unread so the caller can supply more
    /// input.
    ///
    /// # Errors
    ///
    /// * `DecodeError::InputIsEmpty` if `src` is empty.
    /// * `DecodeError::OutputTooShort` if `dst` has fewer than four bytes of spare capacity.
    /// * `DecodeError::Malformed` for an unpaired surrogate (`length: 2`), or, when `finish`
    ///   is set, for a single dangling byte after the last complete code unit (`length: 1`).
    ///   Text decoded before the error has already been appended to `dst`.
    fn decode(
        &mut self,
        src: &[u8],
        dst: &mut String,
        finish: bool,
    ) -> Result<(usize, usize), DecodeError> {
        if src.is_empty() {
            return Err(DecodeError::InputIsEmpty);
        }
        if dst.capacity() - dst.len() < 4 {
            return Err(DecodeError::OutputTooShort);
        }
        let units = src
            .chunks_exact(2)
            .map(|v| u16::from_be_bytes([v[0], v[1]]));
        let mut read = 0;
        let mut write = 0;
        for c in char::decode_utf16(units) {
            match c {
                Ok(c) => {
                    read += c.len_utf16() * 2;
                    write += c.len_utf8();
                    dst.push(c);
                }
                Err(_) => {
                    // Near the end of a non-final chunk the missing half of a pair may still
                    // arrive, so defer the verdict.
                    if !finish && src.len() - read < 4 {
                        break;
                    }
                    return Err(DecodeError::Malformed {
                        read: read + 2,
                        write,
                        length: 2,
                        offset: 0,
                    });
                }
            }
            if dst.capacity() - dst.len() < 4 {
                break;
            }
        }
        // `chunks_exact` silently skips an odd trailing byte. At the end of the stream that
        // byte is truncated input and must be reported, not dropped.
        if finish && src.len() - read == 1 {
            return Err(DecodeError::Malformed {
                read: read + 1,
                write,
                length: 1,
                offset: 0,
            });
        }
        Ok((read, write))
    }
}
/// Canonical name of little-endian UTF-16; reported by the UTF-16LE coders and used as their registry key.
pub const UTF16LE_NAME: &str = "UTF-16LE";
/// Encoder producing little-endian UTF-16 without a byte order mark.
pub struct UTF16LEEncoder;

impl Encoder for UTF16LEEncoder {
    fn name(&self) -> &'static str {
        UTF16LE_NAME
    }

    /// Encodes characters of `src` as little-endian UTF-16 code units.
    ///
    /// Stops as soon as fewer than four bytes of `dst` remain, which is the room a surrogate
    /// pair needs. Returns `Ok((read, write))`: UTF-8 bytes consumed from `src` and bytes
    /// stored in `dst`. The `finish` flag is not used.
    ///
    /// # Errors
    ///
    /// * `EncodeError::InputIsEmpty` if `src` is empty.
    /// * `EncodeError::OutputTooShort` if `dst` is shorter than four bytes.
    fn encode(
        &mut self,
        src: &str,
        dst: &mut [u8],
        _finish: bool,
    ) -> Result<(usize, usize), EncodeError> {
        if src.is_empty() {
            return Err(EncodeError::InputIsEmpty);
        }
        if dst.len() < 4 {
            return Err(EncodeError::OutputTooShort);
        }
        let mut units = [0u16; 2];
        let mut consumed = 0;
        let mut produced = 0;
        for ch in src.chars() {
            consumed += ch.len_utf8();
            // One unit for BMP characters, two (a surrogate pair) otherwise.
            for unit in ch.encode_utf16(&mut units).iter() {
                dst[produced..produced + 2].copy_from_slice(&unit.to_le_bytes());
                produced += 2;
            }
            if dst.len() - produced < 4 {
                break;
            }
        }
        Ok((consumed, produced))
    }
}
/// Decoder consuming little-endian UTF-16 without byte order mark handling.
pub struct UTF16LEDecoder;

impl Decoder for UTF16LEDecoder {
    fn name(&self) -> &'static str {
        UTF16LE_NAME
    }

    /// Decodes little-endian UTF-16 code units from `src` and appends the text to `dst`.
    ///
    /// Decoding stops once `dst` has fewer than four bytes of spare capacity, so `dst` is
    /// never reallocated. Returns `Ok((read, write))`: bytes consumed from `src` and UTF-8
    /// bytes appended to `dst`. While `finish` is not set, an unpaired surrogate within the
    /// last four bytes and a dangling odd byte are left unread so the caller can supply more
    /// input.
    ///
    /// # Errors
    ///
    /// * `DecodeError::InputIsEmpty` if `src` is empty.
    /// * `DecodeError::OutputTooShort` if `dst` has fewer than four bytes of spare capacity.
    /// * `DecodeError::Malformed` for an unpaired surrogate (`length: 2`), or, when `finish`
    ///   is set, for a single dangling byte after the last complete code unit (`length: 1`).
    ///   Text decoded before the error has already been appended to `dst`.
    fn decode(
        &mut self,
        src: &[u8],
        dst: &mut String,
        finish: bool,
    ) -> Result<(usize, usize), DecodeError> {
        if src.is_empty() {
            return Err(DecodeError::InputIsEmpty);
        }
        if dst.capacity() - dst.len() < 4 {
            return Err(DecodeError::OutputTooShort);
        }
        let units = src
            .chunks_exact(2)
            .map(|v| u16::from_le_bytes([v[0], v[1]]));
        let mut read = 0;
        let mut write = 0;
        for c in char::decode_utf16(units) {
            match c {
                Ok(c) => {
                    read += c.len_utf16() * 2;
                    write += c.len_utf8();
                    dst.push(c);
                }
                Err(_) => {
                    // Near the end of a non-final chunk the missing half of a pair may still
                    // arrive, so defer the verdict.
                    if !finish && src.len() - read < 4 {
                        break;
                    }
                    return Err(DecodeError::Malformed {
                        read: read + 2,
                        write,
                        length: 2,
                        offset: 0,
                    });
                }
            }
            if dst.capacity() - dst.len() < 4 {
                break;
            }
        }
        // `chunks_exact` silently skips an odd trailing byte. At the end of the stream that
        // byte is truncated input and must be reported, not dropped.
        if finish && src.len() - read == 1 {
            return Err(DecodeError::Malformed {
                read: read + 1,
                write,
                length: 1,
                offset: 0,
            });
        }
        Ok((read, write))
    }
}
/// Canonical names of the encodings that `ENCODER_TABLE` and `DECODER_TABLE` register at start-up.
pub const DEFAULT_SUPPORTED_ENCODINGS: &[&str] =
&[UTF16_NAME, UTF16BE_NAME, UTF16LE_NAME, UTF8_NAME];
/// Global map from alias spellings to canonical encoding names.
///
/// It starts with the hyphen-less spellings of the built-in encodings and is consulted by
/// `find_encoder`/`find_decoder` when a name is not registered directly.
pub static ENCODING_ALIASES: LazyLock<RwLock<BTreeMap<&'static str, &'static str>>> =
    LazyLock::new(|| {
        let mut aliases = BTreeMap::new();
        for (alias, canonical) in [
            ("UTF8", UTF8_NAME),
            ("UTF16", UTF16_NAME),
            ("UTF16BE", UTF16BE_NAME),
            ("UTF16LE", UTF16LE_NAME),
        ] {
            aliases.insert(alias, canonical);
        }
        RwLock::new(aliases)
    });
/// Registers `alias` as another spelling of the encoding named `real`.
///
/// Returns the name `alias` pointed to before, if it was already registered.
///
/// # Panics
///
/// Panics if the alias table lock is poisoned.
pub fn register_encoding_alias(alias: &'static str, real: &'static str) -> Option<&'static str> {
    let mut aliases = ENCODING_ALIASES.write().unwrap();
    aliases.insert(alias, real)
}
/// Removes `alias` from the alias table.
///
/// Returns the name it pointed to, or `None` if it was not registered.
///
/// # Panics
///
/// Panics if the alias table lock is poisoned.
pub fn unregister_encoding_alias(alias: &'static str) -> Option<&'static str> {
    let mut aliases = ENCODING_ALIASES.write().unwrap();
    aliases.remove(alias)
}
/// Function pointer that creates a new boxed encoder; the value type of `ENCODER_TABLE`.
pub type EncoderFactory = fn() -> Box<dyn Encoder>;
pub static ENCODER_TABLE: LazyLock<RwLock<BTreeMap<&'static str, EncoderFactory>>> =
LazyLock::new(|| {
let mut map = BTreeMap::<&'static str, EncoderFactory>::new();
map.insert(UTF8_NAME, || Box::new(UTF8Encoder));
map.insert(UTF16_NAME, || Box::new(UTF16Encoder { init: false }));
map.insert(UTF16BE_NAME, || Box::new(UTF16BEEncoder));
map.insert(UTF16LE_NAME, || Box::new(UTF16LEEncoder));
RwLock::new(map)
});
/// Creates an encoder for `encoding_name`.
///
/// The name is first looked up directly in `ENCODER_TABLE`. If that fails it is resolved
/// once through `ENCODING_ALIASES` and the resulting name is looked up. Returns `None` when
/// neither lookup finds a factory. Names are matched exactly as written.
///
/// # Panics
///
/// Panics if either registry lock is poisoned.
pub fn find_encoder(encoding_name: &str) -> Option<Box<dyn Encoder>> {
    let encoders = ENCODER_TABLE.read().unwrap();
    let factory = match encoders.get(encoding_name) {
        Some(direct) => direct,
        None => {
            let canonical = ENCODING_ALIASES
                .read()
                .unwrap()
                .get(encoding_name)
                .copied()?;
            encoders.get(canonical)?
        }
    };
    Some(factory())
}
/// Registers `factory` as the encoder factory for `encoding_name`.
///
/// Returns the factory that was registered under that name before, if any.
///
/// # Panics
///
/// Panics if the encoder table lock is poisoned.
pub fn register_encoder(
    encoding_name: &'static str,
    factory: EncoderFactory,
) -> Option<EncoderFactory> {
    let mut encoders = ENCODER_TABLE.write().unwrap();
    encoders.insert(encoding_name, factory)
}
/// Removes the encoder factory registered under `encoding_name`.
///
/// Returns the removed factory, or `None` if the name was not registered.
///
/// # Panics
///
/// Panics if the encoder table lock is poisoned.
pub fn unregister_encoder(encoding_name: &str) -> Option<EncoderFactory> {
    let mut encoders = ENCODER_TABLE.write().unwrap();
    encoders.remove(encoding_name)
}
/// Function pointer that creates a new boxed decoder; the value type of `DECODER_TABLE`.
pub type DecoderFactory = fn() -> Box<dyn Decoder>;
pub static DECODER_TABLE: LazyLock<RwLock<BTreeMap<&'static str, DecoderFactory>>> =
LazyLock::new(|| {
let mut map = BTreeMap::<&'static str, DecoderFactory>::new();
map.insert(UTF8_NAME, || Box::new(UTF8Decoder));
map.insert(UTF16_NAME, || {
Box::new(UTF16Decoder {
read: 0,
top: [0; 2],
be: true,
})
});
map.insert(UTF16BE_NAME, || Box::new(UTF16BEDecoder));
map.insert(UTF16LE_NAME, || Box::new(UTF16LEDecoder));
RwLock::new(map)
});
/// Creates a decoder for `encoding_name`.
///
/// The name is first looked up directly in `DECODER_TABLE`. If that fails it is resolved
/// once through `ENCODING_ALIASES` and the resulting name is looked up. Returns `None` when
/// neither lookup finds a factory. Names are matched exactly as written.
///
/// # Panics
///
/// Panics if either registry lock is poisoned.
pub fn find_decoder(encoding_name: &str) -> Option<Box<dyn Decoder>> {
    let decoders = DECODER_TABLE.read().unwrap();
    let factory = match decoders.get(encoding_name) {
        Some(direct) => direct,
        None => {
            let canonical = ENCODING_ALIASES
                .read()
                .unwrap()
                .get(encoding_name)
                .copied()?;
            decoders.get(canonical)?
        }
    };
    Some(factory())
}
/// Registers `factory` as the decoder factory for `encoding_name`.
///
/// Returns the factory that was registered under that name before, if any.
///
/// # Panics
///
/// Panics if the decoder table lock is poisoned.
pub fn register_decoder(
    encoding_name: &'static str,
    factory: DecoderFactory,
) -> Option<DecoderFactory> {
    let mut decoders = DECODER_TABLE.write().unwrap();
    decoders.insert(encoding_name, factory)
}
/// Removes the decoder factory registered under `encoding_name`.
///
/// Returns the removed factory, or `None` if the name was not registered.
///
/// # Panics
///
/// Panics if the decoder table lock is poisoned.
pub fn unregister_decoder(encoding_name: &str) -> Option<DecoderFactory> {
    let mut decoders = DECODER_TABLE.write().unwrap();
    decoders.remove(encoding_name)
}