#[macro_use] extern crate matches;
use std::ops::Deref;
use std::result;
use std::str;
/// The Unicode replacement character, U+FFFD, as a string slice.
///
/// `LossyDecoder` pushes this once for every invalid byte sequence it meets,
/// and once more on drop if the input ended in the middle of a sequence.
pub const REPLACEMENT_CHARACTER: &'static str = "\u{FFFD}";
/// An incremental UTF-8 decoder that accepts its input in arbitrary chunks.
///
/// A multi-byte sequence may be split across two (or more) chunks; the bytes
/// seen so far are remembered here until a later call to `decode` supplies
/// the rest.
pub struct Decoder {
// Leading bytes of a sequence cut off by the end of the previous chunk.
// Its `len` is 0 when nothing is pending.
incomplete_sequence: IncompleteSequence,
}
/// The leading bytes of a multi-byte sequence whose remaining bytes have not
/// arrived yet.
struct IncompleteSequence {
/// Number of bytes stored so far; 0 means no sequence is pending.
len: u8,
/// Lead byte of the sequence; `UTF8_CHAR_WIDTH` maps it to the full width.
first: u8,
/// Second byte; only read back when `len` is at least 2.
second: u8,
/// Third byte; only read back when `len` is at least 3.
third: u8,
}
impl Decoder {
/// Creates a decoder with no pending incomplete sequence.
#[inline]
pub fn new() -> Decoder {
Decoder {
incomplete_sequence: IncompleteSequence {
len: 0,
first: 0,
second: 0,
third: 0,
}
}
}
/// Returns `true` while the decoder is holding the start of a multi-byte
/// sequence that has been neither completed nor rejected yet.
#[inline]
pub fn has_incomplete_sequence(&self) -> bool {
self.incomplete_sequence.len > 0
}
/// Decodes the next chunk of a byte stream.
///
/// Returns a tuple of:
///
/// * the code point, if any, that was started by an earlier chunk and is
///   completed by the first bytes of `input_chunk` (empty otherwise);
/// * the longest well-formed UTF-8 prefix of the rest of `input_chunk`,
///   borrowed from the input;
/// * a `Result` telling why decoding stopped.
///
/// On `Result::Error`, decoding can be resumed by calling `decode` again with
/// `remaining_input_after_error`. On `Result::Incomplete`, the trailing bytes
/// of the chunk are saved in the decoder and the next call continues from
/// them.
pub fn decode<'a>(&mut self, input_chunk: &'a [u8]) -> (InlineString, &'a str, Result<'a>) {
// First try to finish a sequence left over from the previous chunk.
// `input` is what remains of the chunk after the bytes used for that.
let (ch, input) = match self.incomplete_sequence.complete(input_chunk) {
Ok(tuple) => tuple,
Err(result) => return (InlineString::empty(), "", result)
};
// Invariant: `input[..position]` is well-formed UTF-8 and `position`
// never exceeds `input.len()`.
let mut position = 0;
loop {
let first = match input.get(position) {
Some(&b) => b,
None => return (
ch,
// SAFETY: `position` has reached the end of `input`, and every byte
// before `position` was validated by the loop below.
unsafe {
str::from_utf8_unchecked(input)
},
Result::Ok,
)
};
if first < 128 {
// ASCII: always a complete, valid one-byte sequence.
position += 1
} else {
// The part of `input` validated so far.
macro_rules! valid_prefix {
() => {
// SAFETY: `position` only advances past complete, validated
// sequences, so `input[..position]` is well-formed UTF-8.
unsafe {
str::from_utf8_unchecked(&input[..position])
}
}
}
// Fetches the byte `$current_sequence_len` places after the lead byte.
// If the chunk ends first, stores the bytes read so far (passed in by
// the caller, 0 for those not read yet) and returns `Incomplete`.
macro_rules! next {
($current_sequence_len: expr, $first: expr, $second: expr, $third: expr) => {
match input.get(position + $current_sequence_len) {
Some(&b) => b,
None => {
self.incomplete_sequence = IncompleteSequence {
len: $current_sequence_len,
first: $first,
second: $second,
third: $third,
};
return (ch, valid_prefix!(), Result::Incomplete)
}
}
}
}
// Returns an error unless `$valid` holds. `$current_sequence_len` is the
// number of bytes of the bad sequence to skip: the byte that failed the
// check is left in the remaining input so it is examined again as the
// possible start of a new sequence.
macro_rules! check {
($valid: expr, $current_sequence_len: expr) => {
if !$valid {
return (
ch,
valid_prefix!(),
Result::Error {
remaining_input_after_error:
&input[position + $current_sequence_len..]
}
)
}
}
}
let width = UTF8_CHAR_WIDTH[first as usize];
// Width 0 marks a byte that can never start a sequence.
check!(width != 0, 1);
let second = next!(1, first, 0, 0);
// The allowed range of the second byte depends on the lead byte for
// three- and four-byte sequences.
let valid = match width {
2 => is_continuation_byte(second),
3 => valid_three_bytes_sequence_prefix(first, second),
_ => {
debug_assert!(width == 4);
valid_four_bytes_sequence_prefix(first, second)
}
};
check!(valid, 1);
if width > 2 {
let third = next!(2, first, second, 0);
check!(is_continuation_byte(third), 2);
if width > 3 {
let fourth = next!(3, first, second, third);
check!(is_continuation_byte(fourth), 3);
}
}
// The whole sequence is valid: step over it.
position += width as usize;
}
}
}
}
/// Why a call to `Decoder::decode` stopped.
///
/// This name shadows the standard `Result` wherever it is in scope; this file
/// reaches the standard type as `result::Result`.
#[derive(Debug, Copy, Clone)]
pub enum Result<'a> {
/// The end of the chunk was reached on a sequence boundary.
Ok,
/// The chunk ended in the middle of a multi-byte sequence; the bytes seen
/// so far are kept in the decoder.
Incomplete,
/// An invalid byte sequence was found. `remaining_input_after_error` is
/// the part of the chunk that follows it and has not been decoded yet.
Error { remaining_input_after_error: &'a [u8] },
}
impl IncompleteSequence {
/// Tries to finish the stored partial sequence with the first bytes of
/// `input`.
///
/// Returns:
///
/// * `Ok((ch, rest))` when nothing was pending (`ch` is empty and `rest` is
///   all of `input`), or when the sequence was completed (`ch` holds its
///   bytes and `rest` is `input` after the bytes that were consumed);
/// * `Err(Result::Incomplete)` when `input` ran out first; the bytes
///   accepted so far stay stored in `self`;
/// * `Err(Result::Error { .. })` when the next byte cannot continue the
///   sequence; the pending bytes are dropped and
///   `remaining_input_after_error` starts at that offending byte.
fn complete<'a>(&mut self, input: &'a [u8])
-> result::Result<(InlineString, &'a [u8]), Result<'a>> {
if self.len == 0 {
return Ok((InlineString::empty(), input))
}
let width = width(self.first);
debug_assert!(0 < self.len && self.len < width && width <= 4);
// Number of bytes of `input` accepted into the sequence so far.
let mut position = 0;
// Reads the next input byte, or records the progress made and returns
// `Incomplete` if `input` is exhausted.
macro_rules! next {
() => {
match input.get(position) {
Some(&b) => b,
None => {
let new_len = self.len + position as u8;
debug_assert!(new_len < 4);
self.len = new_len;
return Err(Result::Incomplete)
}
}
}
}
// Abandons the pending sequence and returns an error unless `$valid`
// holds. `position` has not been advanced past the byte being checked,
// so that byte is the first one of the remaining input.
macro_rules! check {
($valid: expr) => {
if !$valid {
self.len = 0;
return Err(Result::Error { remaining_input_after_error: &input[position..] })
}
}
}
// Only the lead byte is stored: the second byte comes from `input`.
if self.len < 2 {
self.second = next!();
let valid = match width {
2 => is_continuation_byte(self.second),
3 => valid_three_bytes_sequence_prefix(self.first, self.second),
_ => {
debug_assert!(width == 4);
valid_four_bytes_sequence_prefix(self.first, self.second)
}
};
check!(valid);
position += 1;
}
// The fourth byte is never stored in `self`: a sequence has at most four
// bytes, so reading it always completes the sequence.
let mut fourth = 0;
if width > 2 {
if self.len < 3 {
self.third = next!();
check!(is_continuation_byte(self.third));
position += 1;
}
if width > 3 {
fourth = next!();
check!(is_continuation_byte(fourth));
position += 1;
}
}
// For sequences shorter than four bytes the trailing buffer slots hold
// leftover values, but they lie beyond `len`.
let ch = InlineString {
buffer: [self.first, self.second, self.third, fourth],
len: width,
};
self.len = 0;
Ok((ch, &input[position..]))
}
}
/// Looks up, from its lead byte, the total number of bytes in a UTF-8
/// sequence.
///
/// The value comes straight from `UTF8_CHAR_WIDTH`; 0 is the table's entry
/// for bytes that `Decoder::decode` rejects as lead bytes.
#[inline]
fn width(first_byte: u8) -> u8 {
    let index = usize::from(first_byte);
    UTF8_CHAR_WIDTH[index]
}
/// Total length in bytes of the UTF-8 sequence introduced by each possible
/// lead byte, or 0 for bytes that cannot start a sequence.
///
/// Laid out as 16 rows of 16 entries; the trailing comment on each row gives
/// the value of its first byte.
const UTF8_CHAR_WIDTH: &'static [u8; 256] = &[
    // 0x00..0x7F: ASCII, one byte each.
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70
    // 0x80..0xBF: continuation bytes, never a lead byte.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0
    // 0xC0 and 0xC1 could only start overlong encodings; 0xC2..0xDF start
    // two-byte sequences.
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0
    // 0xE0..0xEF start three-byte sequences.
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0
    // 0xF0..0xF4 start four-byte sequences; 0xF5..0xFF are never valid.
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0
];
/// Returns `true` if `b` has the bit pattern `10xxxxxx`, i.e. lies in
/// `0x80..=0xBF`, the form of every non-leading byte of a UTF-8 sequence.
#[inline]
fn is_continuation_byte(b: u8) -> bool {
    // Shifting out the six payload bits leaves only the two-bit tag.
    b >> 6 == 0b10
}
/// Returns `true` if `first` and `second` can be the first two bytes of a
/// well-formed three-byte UTF-8 sequence.
///
/// The range allowed for the second byte depends on the lead byte:
///
/// | `first`        | `second`       |
/// |----------------|----------------|
/// | `0xE0`         | `0xA0 - 0xBF`  |
/// | `0xE1 - 0xEC`  | `0x80 - 0xBF`  |
/// | `0xED`         | `0x80 - 0x9F`  |
/// | `0xEE - 0xEF`  | `0x80 - 0xBF`  |
///
/// Any other lead byte yields `false`.
#[inline]
fn valid_three_bytes_sequence_prefix(first: u8, second: u8) -> bool {
    // Written with plain comparisons rather than `...` range patterns, which
    // are deprecated and rejected outright by the 2021 edition.
    let (lowest, highest) = match first {
        // A lower second byte would be an overlong encoding.
        0xE0 => (0xA0, 0xBF),
        // A higher second byte would encode a surrogate (U+D800..U+DFFF).
        0xED => (0x80, 0x9F),
        // Every other three-byte lead (0xED is already handled above).
        _ if 0xE1 <= first && first <= 0xEF => (0x80, 0xBF),
        _ => return false,
    };
    lowest <= second && second <= highest
}
/// Returns `true` if `first` and `second` can be the first two bytes of a
/// well-formed four-byte UTF-8 sequence.
///
/// The range allowed for the second byte depends on the lead byte:
///
/// | `first`        | `second`       |
/// |----------------|----------------|
/// | `0xF0`         | `0x90 - 0xBF`  |
/// | `0xF1 - 0xF3`  | `0x80 - 0xBF`  |
/// | `0xF4`         | `0x80 - 0x8F`  |
///
/// Any other lead byte yields `false`.
#[inline]
fn valid_four_bytes_sequence_prefix(first: u8, second: u8) -> bool {
    // Written with plain comparisons rather than `...` range patterns, which
    // are deprecated and rejected outright by the 2021 edition.
    let (lowest, highest) = match first {
        // A lower second byte would be an overlong encoding.
        0xF0 => (0x90, 0xBF),
        // A higher second byte would encode a value above U+10FFFF.
        0xF4 => (0x80, 0x8F),
        _ if 0xF1 <= first && first <= 0xF3 => (0x80, 0xBF),
        _ => return false,
    };
    lowest <= second && second <= highest
}
/// A push-based decoder that never fails: every invalid byte sequence in the
/// input is reported to the callback as `REPLACEMENT_CHARACTER`.
///
/// Dropping the decoder while a multi-byte sequence is still incomplete
/// pushes one final `REPLACEMENT_CHARACTER`.
pub struct LossyDecoder<F: FnMut(&str)> {
// Callback that receives each piece of decoded output, in order.
push_str: F,
// Underlying strict decoder; also carries state between calls to `feed`.
decoder: Decoder,
}
impl<F: FnMut(&str)> LossyDecoder<F> {
#[inline]
pub fn new(push_str: F) -> Self {
LossyDecoder {
push_str: push_str,
decoder: Decoder::new(),
}
}
pub fn feed(&mut self, mut input: &[u8]) {
loop {
let (ch, s, result) = self.decoder.decode(input);
if !ch.is_empty() {
(self.push_str)(&ch);
}
if !s.is_empty() {
(self.push_str)(s);
}
match result {
Result::Ok | Result::Incomplete => break,
Result::Error { remaining_input_after_error: remaining } => {
(self.push_str)(REPLACEMENT_CHARACTER);
input = remaining;
}
}
}
}
}
impl<F: FnMut(&str)> Drop for LossyDecoder<F> {
    /// Pushes one final `REPLACEMENT_CHARACTER` if the input stopped in the
    /// middle of a multi-byte sequence.
    #[inline]
    fn drop(&mut self) {
        if !self.decoder.has_incomplete_sequence() {
            return;
        }
        (self.push_str)(REPLACEMENT_CHARACTER)
    }
}
/// A string of at most four bytes stored inline, without heap allocation.
///
/// It dereferences to `str`, exposing only the first `len` bytes of `buffer`.
#[derive(Copy, Clone)]
pub struct InlineString {
// Backing storage; only `buffer[..len]` is ever exposed.
buffer: [u8; 4],
// Number of bytes of `buffer` in use.
len: u8,
}
impl Deref for InlineString {
    type Target = str;

    /// Views the bytes in use as a string slice.
    #[inline]
    fn deref(&self) -> &str {
        let used = &self.buffer[..usize::from(self.len)];
        // SAFETY: relies on `buffer[..len]` being well-formed UTF-8. In the
        // code visible in this file, values are only built by
        // `InlineString::empty` (no bytes in use) and by
        // `IncompleteSequence::complete` (bytes validated before being
        // stored).
        unsafe { str::from_utf8_unchecked(used) }
    }
}
impl InlineString {
    /// Builds a string with no bytes in use.
    fn empty() -> InlineString {
        InlineString { len: 0, buffer: [0; 4] }
    }

    /// Returns the number of bytes in use.
    pub fn len(&self) -> usize {
        usize::from(self.len)
    }

    /// Returns `true` if no bytes are in use.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}