#![allow(unsafe_code)]
use std::fmt::Debug;
/// Error reported by a [`Decoder`] when the input words do not decode to a valid `char`.
///
/// Kept as an empty tuple struct so that the bare name `InvalidChar` can be passed as a
/// constructor function (e.g. to `Option::ok_or_else`).
#[derive(Debug,Clone,Copy)]
pub struct InvalidChar();
/// A character decoder that reads one character at a time from the start of a slice of
/// encoded words.
pub trait Decoder {
/// The unit of encoded input this decoder consumes (for example `u8` for UTF-8).
type Word : Default + Copy + Debug;
/// The largest number of words a single encoded character can occupy.
const MAX_CODEPOINT_LEN: usize;
/// Decodes the character that starts at `words[0]`.
///
/// The returned [`Char`] carries either the decoded character or [`InvalidChar`], together
/// with `size`, the number of words the caller should advance by before decoding again.
fn decode(words:&[Self::Word]) -> Char<InvalidChar>;
}
/// Result of decoding a single character: the character itself (or the error that prevented
/// decoding) and the length of its encoded form.
#[derive(Debug,Clone,Copy,PartialEq)]
pub struct Char<Error> {
/// The decoded character, or the error describing why no character could be produced.
pub char: Result<char,Error>,
/// Number of input words covered by this character; used to advance through the input.
pub size: usize,
}
impl Char<crate::Error> {
pub fn is_eof(&self) -> bool {
match self.char {
Ok(_) => false,
Err(crate::Error::EOF) => true,
Err(_) => false
}
}
}
/// Decoder for UTF-8 encoded input, operating on `u8` words.
#[derive(Debug, Copy, Clone)]
pub struct DecoderUTF8();

impl Decoder for DecoderUTF8 {
    type Word = u8;

    /// A UTF-8 sequence is at most four bytes long.
    const MAX_CODEPOINT_LEN: usize = 4;

    /// Decodes the UTF-8 sequence that starts at `words[0]`.
    ///
    /// `size` is the sequence length announced by the lead byte (1 to 4), clamped to the
    /// number of bytes actually available, so advancing by `size` never runs past the end
    /// of `words`.
    ///
    /// `char` is `Err(InvalidChar)` whenever those bytes are not exactly one well-formed
    /// UTF-8 sequence: a truncated sequence, a stray or missing continuation byte, an
    /// overlong encoding, an encoded surrogate, or a value above U+10FFFF.
    ///
    /// # Panics
    ///
    /// Panics if `words` is empty; callers must check for end of input first.
    fn decode(words: &[u8]) -> Char<InvalidChar> {
        let lead = words[0];
        // The high bits of the lead byte announce how long the sequence claims to be.
        // Anything that is not a multi-byte lead (ASCII or a stray continuation byte) is
        // treated as a one-byte sequence and left to the validation below.
        let announced = match lead {
            0xF0..=0xFF => 4,
            0xE0..=0xEF => 3,
            0xC0..=0xDF => 2,
            _ => 1,
        };
        // Input may end in the middle of a sequence; never slice past the end.
        let size = announced.min(words.len());
        // Let the standard library do the strict validation and the bit assembly.
        let decoded = std::str::from_utf8(&words[..size])
            .ok()
            .and_then(|text| text.chars().next())
            .ok_or_else(InvalidChar);
        Char { char: decoded, size }
    }
}
#[derive(Debug,Copy,Clone)]
pub struct DecoderUTF16();
impl Decoder for DecoderUTF16 {
type Word = u16;
const MAX_CODEPOINT_LEN: usize = 2;
fn decode(words: &[u16]) -> Char<InvalidChar> {
if words[0] < 0xD800 || 0xDFFF < words[0] {
let char = Ok(unsafe{std::char::from_u32_unchecked(words[0] as u32)});
return Char{char,size:1};
}
let char = (((words[0] - 0xD800) as u32) << 10 | (words[1] - 0xDC00) as u32) + 0x1_0000;
Char{char:std::char::from_u32(char).ok_or_else(InvalidChar), size:2}
}
}
/// Decoder for input that is already a sequence of `char` values.
#[derive(Debug, Copy, Clone)]
pub struct DecoderUTF32();

impl Decoder for DecoderUTF32 {
    type Word = char;

    /// Every character occupies exactly one word.
    const MAX_CODEPOINT_LEN: usize = 1;

    /// Returns the first word unchanged as a successfully decoded character of size 1.
    ///
    /// # Panics
    ///
    /// Panics if `words` is empty.
    fn decode(words: &[char]) -> Char<InvalidChar> {
        let first = words[0];
        Char { char: Ok(first), size: 1 }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use itertools::Itertools;

    /// Sample text mixing ASCII with multi-word characters in every tested encoding.
    const SAMPLE: &str = "a.b^c! #𤭢界んにち𤭢#𤭢";

    /// Runs decoder `D` over `words` until the input is exhausted, collecting every decoded
    /// character and panicking on the first decoding error.
    fn decode_all<D: Decoder>(mut words: &[D::Word]) -> Vec<char> {
        let mut decoded = Vec::new();
        while !words.is_empty() {
            let next = D::decode(words);
            decoded.push(next.char.unwrap());
            words = &words[next.size..];
        }
        decoded
    }

    #[test]
    fn test_utf8() {
        let decoded: String = decode_all::<DecoderUTF8>(SAMPLE.as_bytes()).into_iter().collect();
        assert_eq!(decoded, SAMPLE);
    }

    #[test]
    fn test_utf16() {
        let units = SAMPLE.encode_utf16().collect_vec();
        let decoded: String = decode_all::<DecoderUTF16>(&units[..]).into_iter().collect();
        assert_eq!(decoded, SAMPLE);
    }

    #[test]
    fn test_utf32() {
        let chars = SAMPLE.chars().collect_vec();
        assert_eq!(decode_all::<DecoderUTF32>(&chars[..]), chars);
    }
}