/// A character iterator that can also hand out decoded characters in batches.
///
/// Implementors are iterators over `char`; the two extra methods pull several
/// characters per call instead of one.
pub trait Decoder: Iterator<Item = char> {
/// Decodes characters into a freshly allocated vector.
///
/// `n` is the number of characters requested; the vector may be shorter
/// when the input runs out first.
fn next_n(&mut self, n: usize) -> Vec<char>;
/// Decodes characters into the front of `target` and returns how many
/// slots were written.
///
/// `n` is the number of characters requested. Callers should pass a
/// `target` with room for at least `n` characters.
fn fill_n(&mut self, n: usize, target: &mut [char]) -> usize;
}
/// Converts a raw code point into a `char`.
///
/// Values that are not Unicode scalar values (surrogates, anything above
/// U+10FFFF) are mapped to U+FFFD REPLACEMENT CHARACTER instead of failing.
#[inline]
fn check(cp: u32) -> char {
    match std::char::from_u32(cp) {
        Some(valid) => valid,
        None => std::char::REPLACEMENT_CHARACTER,
    }
}
/// Text encodings the decoder in this file knows how to read.
#[derive(Eq, PartialEq, Debug, Copy, Clone)]
pub enum Encoding {
/// UTF-8 (byte-order mark `EF BB BF`).
UTF8,
/// UTF-16, little-endian: low byte of each code unit first (byte-order mark `FF FE`).
UTF16LE,
/// UTF-16, big-endian: high byte of each code unit first (byte-order mark `FE FF`).
UTF16BE,
}
/// Lookup table driving the table-based UTF-8 state machine in `next_utf8`.
///
/// Layout, as used by the decoder:
/// * entries `0..256` map every possible byte value to a character class
///   (read as `UTF8D[byte]`);
/// * entries `256..400` are the transition table, 16 entries per state,
///   read as `UTF8D[256 + state * 16 + class]`.
///
/// NOTE(review): the values appear to match Bjoern Hoehrmann's "Flexible and
/// Economical UTF-8 Decoder" table - confirm the origin/licence before editing.
#[rustfmt::skip]
const UTF8D: [u8; 400] = [
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, ];
/// State of the UTF-8 state machine between characters: it is the start state
/// for every character and is reached again once a full code point is decoded.
const UTF8_ACCEPT_STATE: u8 = 0;
/// State the UTF-8 state machine enters when the bytes seen so far cannot form
/// a valid sequence; the decoder then emits U+FFFD.
const UTF8_REJECT_STATE: u8 = 1;
/// Decoder that turns an in-memory byte buffer into a stream of `char`s.
///
/// The encoding is chosen once, at construction, from a leading byte-order
/// mark or a platform default.
#[derive(Debug)]
pub struct Decode {
// Raw input bytes, including any byte-order mark.
buffer: Vec<u8>,
// Length of `buffer` in bytes, captured at construction.
size: usize,
// Index into `buffer` of the next byte to decode.
next: usize,
// Encoding used to interpret `buffer`.
encoding: Encoding,
}
impl Decode {
    /// Creates a decoder over `bytes`.
    ///
    /// The encoding is guessed from a leading byte-order mark; without one the
    /// platform default is used (UTF-16LE on Windows, UTF-8 elsewhere). A
    /// recognised byte-order mark is skipped and never yielded as a character.
    pub fn new(bytes: Vec<u8>) -> Self {
        let length = bytes.len();
        let mut decode = Decode {
            buffer: bytes,
            size: length,
            next: 0,
            encoding: Encoding::UTF8,
        };
        decode.guess_encoding();
        decode
    }

    /// Creates a decoder over the bytes of `string`.
    ///
    /// The input is always decoded as UTF-8, on every platform.
    pub fn from_string(string: &str) -> Self {
        let mut decode = Decode::new(string.bytes().collect());
        // A `&str` is guaranteed to be UTF-8, so the platform default chosen
        // for BOM-less input (UTF-16LE on Windows) must not apply here. The
        // UTF-16 BOM branches cannot trigger: 0xFE/0xFF never occur in UTF-8.
        decode.encoding = Encoding::UTF8;
        decode
    }

    /// Selects the encoding from a leading byte-order mark and positions the
    /// read cursor just past it; falls back to the platform default otherwise.
    fn guess_encoding(&mut self) {
        if self.size >= 3 && self.buffer[0..3] == [0xEF, 0xBB, 0xBF] {
            self.encoding = Encoding::UTF8;
            self.next = 3;
        } else if self.size >= 2 && self.buffer[0..2] == [0xFE, 0xFF] {
            self.encoding = Encoding::UTF16BE;
            self.next = 2;
        } else if self.size >= 2 && self.buffer[0..2] == [0xFF, 0xFE] {
            self.encoding = Encoding::UTF16LE;
            self.next = 2;
        } else {
            self.encoding = if cfg!(windows) {
                Encoding::UTF16LE
            } else {
                Encoding::UTF8
            };
        }
    }

    /// Decodes the next UTF-8 character, or returns `None` at end of input.
    ///
    /// Malformed input never panics and never ends the stream early: every
    /// invalid or truncated sequence yields one U+FFFD, and the byte that
    /// broke a sequence is decoded again as the start of the next character.
    fn next_utf8(&mut self) -> Option<char> {
        let mut state: u8 = UTF8_ACCEPT_STATE;
        let mut codep: u32 = 0;
        loop {
            if self.next >= self.size {
                // Running out of bytes between characters is a clean end;
                // running out in the middle of one is a truncated sequence.
                return if state == UTF8_ACCEPT_STATE {
                    None
                } else {
                    Some(std::char::REPLACEMENT_CHARACTER)
                };
            }
            let byte = self.buffer[self.next];
            self.next += 1;

            let kind = UTF8D[byte as usize];
            let mid_sequence = state != UTF8_ACCEPT_STATE;
            codep = if mid_sequence {
                // Continuation byte: append its low six payload bits.
                (byte as u32 & 0x3Fu32) | (codep << 6)
            } else {
                // Lead byte: the class doubles as the number of marker bits to strip.
                (0xFFu32 >> kind) & (byte as u32)
            };
            state = UTF8D[256 + (state as usize) * 16 + (kind as usize)];

            match state {
                UTF8_ACCEPT_STATE => return Some(check(codep)),
                UTF8_REJECT_STATE => {
                    if mid_sequence {
                        // The offending byte is not part of the broken
                        // sequence; un-read it so it is not lost.
                        self.next -= 1;
                    }
                    return Some(std::char::REPLACEMENT_CHARACTER);
                }
                _ => {}
            }
        }
    }

    /// Reads the next 16-bit code unit (surrogate or not) in the byte order
    /// of the current encoding; `None` when fewer than two bytes remain.
    fn next_utf16_surrogate(&mut self) -> Option<u16> {
        if self.next + 2 > self.size {
            return None;
        }
        let first = u16::from(self.buffer[self.next]);
        let second = u16::from(self.buffer[self.next + 1]);
        self.next += 2;
        if self.encoding == Encoding::UTF16BE {
            Some((first << 8) | second)
        } else {
            Some((second << 8) | first)
        }
    }

    /// Decodes the next UTF-16 character, or returns `None` at end of input.
    ///
    /// A high surrogate must be followed by a low surrogate; any unpaired
    /// surrogate yields U+FFFD and consumes only that one code unit, so the
    /// unit after it is still decoded normally.
    fn next_utf16(&mut self) -> Option<char> {
        let unit = self.next_utf16_surrogate()?;
        match unit {
            0xD800..=0xDBFF => {
                let resume = self.next;
                match self.next_utf16_surrogate() {
                    Some(low @ 0xDC00..=0xDFFF) => {
                        let high_bits = u32::from(unit) - 0xD800;
                        let low_bits = u32::from(low) - 0xDC00;
                        Some(check(0x10000 + ((high_bits << 10) | low_bits)))
                    }
                    Some(_) => {
                        // Not a low surrogate: leave it for the next call.
                        self.next = resume;
                        Some(std::char::REPLACEMENT_CHARACTER)
                    }
                    // High surrogate at the very end of the input.
                    None => Some(std::char::REPLACEMENT_CHARACTER),
                }
            }
            // A low surrogate with no preceding high surrogate.
            0xDC00..=0xDFFF => Some(std::char::REPLACEMENT_CHARACTER),
            _ => Some(check(u32::from(unit))),
        }
    }
}
impl Decoder for Decode {
    /// Decodes up to `n` characters and returns them in order.
    ///
    /// Stops as soon as the input is exhausted, so the result holds fewer
    /// than `n` characters (possibly none) near the end of the buffer.
    fn next_n(&mut self, n: usize) -> Vec<char> {
        // Every decoded character consumes at least one byte, so the number
        // of unread bytes bounds the result and keeps a huge `n` from
        // reserving (or looping over) more than the input can produce.
        let unread = self.size.saturating_sub(self.next);
        let mut result = Vec::with_capacity(n.min(unread));
        result.extend(self.by_ref().take(n));
        result
    }

    /// Decodes up to `n` characters into the front of `target` and returns
    /// the number of slots written.
    ///
    /// Never writes past the end of `target`: when `n` exceeds
    /// `target.len()` only `target.len()` characters are decoded and the
    /// rest stay in the decoder. The count is also smaller than `n` when the
    /// input runs out first. Slots beyond the returned count are untouched.
    fn fill_n(&mut self, n: usize, target: &mut [char]) -> usize {
        let mut count = 0;
        // The slots come first in the zip on purpose: `zip` polls its left
        // side first, so no character is decoded once the slots run out.
        for (slot, ch) in target.iter_mut().take(n).zip(self.by_ref()) {
            *slot = ch;
            count += 1;
        }
        count
    }
}
impl Iterator for Decode {
    type Item = char;

    /// Yields the next decoded character, dispatching on the encoding chosen
    /// for this decoder; both UTF-16 byte orders share one decoding routine.
    fn next(&mut self) -> Option<char> {
        let encoding = self.encoding;
        match encoding {
            Encoding::UTF8 => self.next_utf8(),
            Encoding::UTF16LE | Encoding::UTF16BE => self.next_utf16(),
        }
    }
}