use core::char;
use core::cmp;
use core::fmt;
#[cfg(feature = "std")]
use std::error;
use ascii;
// States of the UTF-8 state machine. A state is the offset of its row in
// STATES_FORWARD (a multiple of 12, the number of byte classes), so that
// `state + class` indexes the transition table directly.
//
// ACCEPT is the state every decode/validate loop starts in, and the state the
// machine returns to once a complete scalar value has been consumed.
const ACCEPT: usize = 12;
// REJECT is the error state. Its row in STATES_FORWARD is all zeros, so once
// the machine enters it, it can never leave.
const REJECT: usize = 0;
// Maps every possible byte value to one of 12 equivalence classes (0..=11).
// The class, not the raw byte, drives the state machine: `step` and
// `decode_step` add it to the current state to index STATES_FORWARD.
//
// Reading the table (each source row covers 32 byte values):
//   0x00..=0x7F              -> 0
//   0x80..=0x8F              -> 1
//   0x90..=0x9F              -> 9
//   0xA0..=0xBF              -> 7
//   0xC0, 0xC1               -> 8
//   0xC2..=0xDF              -> 2
//   0xE0                     -> 10
//   0xE1..=0xEC, 0xEE, 0xEF  -> 3
//   0xED                     -> 4
//   0xF0                     -> 11
//   0xF1..=0xF3              -> 6
//   0xF4                     -> 5
//   0xF5..=0xFF              -> 8
//
// When the machine is in ACCEPT, `decode_step` also uses the class as a shift
// amount (`0xFF >> class`) to mask the payload bits out of a leading byte.
#[cfg_attr(rustfmt, rustfmt::skip)]
static CLASSES: [u8; 256] = [
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
];
// Transition table of the forward UTF-8 state machine, laid out as 9 rows of
// 12 entries (one column per byte class, 108 entries in total). A state is
// the offset of its row, so the next state is `STATES_FORWARD[state + class]`.
// The largest state stored in the table is 96 and the largest class is 11, so
// `state + class` never exceeds 107.
//
//   row  0 (REJECT): every entry is 0, so the machine stays rejected.
//   row 12 (ACCEPT): class 0 stays in ACCEPT; leading-byte classes branch to
//                    the rows below; classes 1, 7, 8 and 9 are rejected.
//   row 24: one more byte of class 1, 7 or 9 (0x80..=0xBF) returns to ACCEPT.
//   row 36: a byte of class 1, 7 or 9 moves to row 24.
//   row 48: entered on 0xE0; only class 7 (0xA0..=0xBF) moves to row 24.
//   row 60: entered on 0xED; only classes 1 and 9 (0x80..=0x9F) move to row 24.
//   row 72: entered on 0xF0; only classes 7 and 9 (0x90..=0xBF) move to row 36.
//   row 84: entered on 0xF1..=0xF3; classes 1, 7 and 9 move to row 36.
//   row 96: entered on 0xF4; only class 1 (0x80..=0x8F) moves to row 36.
#[cfg_attr(rustfmt, rustfmt::skip)]
static STATES_FORWARD: &'static [u8] = &[
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
/// An iterator over the Unicode scalar values of a byte string.
///
/// The iterator decodes from either end of the slice. Bytes that do not form
/// valid UTF-8 are reported as U+FFFD, because iteration is driven by the
/// lossy decoders in this module.
#[derive(Clone, Debug)]
pub struct Chars<'a> {
    /// The bytes that have not been yielded from either end yet.
    bs: &'a [u8],
}

impl<'a> Chars<'a> {
    /// Creates an iterator over the scalar values encoded in `bs`.
    pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> {
        Self { bs }
    }

    /// Returns the bytes this iterator has not consumed yet.
    ///
    /// The returned slice borrows from the original input, not from the
    /// iterator, so it may outlive the iterator itself.
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        let remaining: &'a [u8] = self.bs;
        remaining
    }
}
impl<'a> Iterator for Chars<'a> {
    type Item = char;

    /// Decodes the scalar value at the front of the remaining bytes.
    ///
    /// Invalid UTF-8 yields U+FFFD. A decoded size of zero only happens when
    /// no bytes are left, which ends the iteration.
    #[inline]
    fn next(&mut self) -> Option<char> {
        match decode_lossy(self.bs) {
            (_, 0) => None,
            (ch, consumed) => {
                self.bs = &self.bs[consumed..];
                Some(ch)
            }
        }
    }
}
impl<'a> DoubleEndedIterator for Chars<'a> {
    /// Decodes the scalar value at the back of the remaining bytes.
    ///
    /// Invalid UTF-8 yields U+FFFD. A decoded size of zero only happens when
    /// no bytes are left, which ends the iteration.
    #[inline]
    fn next_back(&mut self) -> Option<char> {
        match decode_last_lossy(self.bs) {
            (_, 0) => None,
            (ch, consumed) => {
                let keep = self.bs.len() - consumed;
                self.bs = &self.bs[..keep];
                Some(ch)
            }
        }
    }
}
/// An iterator over the Unicode scalar values of a byte string together with
/// the byte range each one was decoded from.
///
/// Items are `(start, end, char)` where `start..end` is the range in the
/// original slice. Invalid UTF-8 is reported as U+FFFD, because iteration is
/// driven by the lossy decoders in this module.
#[derive(Clone, Debug)]
pub struct CharIndices<'a> {
    /// The bytes that have not been yielded from either end yet.
    bs: &'a [u8],
    /// Offset in the original slice of the first remaining byte.
    forward_index: usize,
    /// Offset in the original slice one past the last remaining byte.
    reverse_index: usize,
}

impl<'a> CharIndices<'a> {
    /// Creates an iterator positioned at both ends of `bs`.
    pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
        let reverse_index = bs.len();
        Self { bs, forward_index: 0, reverse_index }
    }

    /// Returns the bytes this iterator has not consumed yet.
    ///
    /// The returned slice borrows from the original input, not from the
    /// iterator, so it may outlive the iterator itself.
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        let remaining: &'a [u8] = self.bs;
        remaining
    }
}
impl<'a> Iterator for CharIndices<'a> {
    type Item = (usize, usize, char);

    /// Decodes the scalar value at the front of the remaining bytes and
    /// returns it with the `start..end` byte range it occupied.
    ///
    /// Invalid UTF-8 yields U+FFFD; the range then covers the bytes that were
    /// skipped.
    #[inline]
    fn next(&mut self) -> Option<(usize, usize, char)> {
        let (ch, consumed) = decode_lossy(self.bs);
        if consumed == 0 {
            // Nothing left to decode.
            return None;
        }
        let start = self.forward_index;
        let end = start + consumed;
        self.bs = &self.bs[consumed..];
        self.forward_index = end;
        Some((start, end, ch))
    }
}
impl<'a> DoubleEndedIterator for CharIndices<'a> {
    /// Decodes the scalar value at the back of the remaining bytes and
    /// returns it with the `start..end` byte range it occupied.
    ///
    /// Invalid UTF-8 yields U+FFFD; the range then covers the bytes that were
    /// skipped.
    #[inline]
    fn next_back(&mut self) -> Option<(usize, usize, char)> {
        let (ch, consumed) = decode_last_lossy(self.bs);
        if consumed == 0 {
            // Nothing left to decode.
            return None;
        }
        let end = self.reverse_index;
        let start = end - consumed;
        self.bs = &self.bs[..self.bs.len() - consumed];
        self.reverse_index = start;
        Some((start, end, ch))
    }
}
/// An error describing where and how a byte string failed UTF-8 validation.
///
/// The error is cheap plain data (two integers), so it derives `Clone` in
/// addition to the comparison traits; this lets callers keep a copy of the
/// error alongside, or after, propagating it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Utf8Error {
    /// Length of the longest prefix of the input that is valid UTF-8.
    valid_up_to: usize,
    /// Number of invalid bytes at `valid_up_to`, or `None` when the input
    /// simply ended in the middle of a sequence.
    error_len: Option<usize>,
}

impl Utf8Error {
    /// Returns the index in the validated slice up to which valid UTF-8 was
    /// verified. Everything before this index is valid UTF-8.
    #[inline]
    pub fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }

    /// Returns the length of the invalid byte sequence that starts at
    /// `valid_up_to()`.
    ///
    /// `Some(n)` (with `n >= 1`) means a byte was rejected outright. `None`
    /// means the input ended before the sequence was complete, so more input
    /// could still make it valid.
    #[inline]
    pub fn error_len(&self) -> Option<usize> {
        self.error_len
    }
}

#[cfg(feature = "std")]
impl error::Error for Utf8Error {
    fn description(&self) -> &str {
        "invalid UTF-8"
    }
}

impl fmt::Display for Utf8Error {
    /// Writes a one-line message naming the byte offset of the first invalid
    /// sequence.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to)
    }
}
/// Checks that `slice` is entirely valid UTF-8.
///
/// Returns `Ok(())` when it is. Otherwise returns a `Utf8Error` whose
/// `valid_up_to` is the length of the longest valid prefix and whose
/// `error_len` is the length of the offending sequence, or `None` when the
/// input ended in the middle of a sequence.
pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
    // Advances the state machine by one byte.
    fn step(state: usize, b: u8) -> usize {
        let index = state + CLASSES[b as usize] as usize;
        // SAFETY: every entry of CLASSES is at most 11, and `state` is either
        // ACCEPT or a value read from STATES_FORWARD, whose entries are at
        // most 96. The index is therefore at most 107, and STATES_FORWARD
        // holds 108 entries.
        let next = unsafe { *STATES_FORWARD.get_unchecked(index) };
        next as usize
    }

    // Re-validates a small window around the failure while tracking the
    // position of the last complete scalar value, which the fast path does
    // not do.
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
        let mut state = ACCEPT;
        let mut valid_up_to = 0;
        for (i, &b) in slice.iter().enumerate() {
            state = step(state, b);
            match state {
                ACCEPT => valid_up_to = i + 1,
                REJECT => {
                    // A rejected sequence is always reported as at least one
                    // byte long.
                    let error_len = Some(cmp::max(1, i - valid_up_to));
                    return Err(Utf8Error { valid_up_to, error_len });
                }
                _ => {}
            }
        }
        if state == ACCEPT {
            Ok(())
        } else {
            // Ran out of input in the middle of a sequence.
            Err(Utf8Error { valid_up_to, error_len: None })
        }
    }

    // Builds the error for a failure detected at `rejected_at`. Kept out of
    // line so the fast loop stays small.
    #[inline(never)]
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
        // Walk back to the closest leading byte before the failure so that
        // the slow path starts on a sequence boundary.
        let mut start = rejected_at.saturating_sub(1);
        while start > 0 && !is_leading_utf8_byte(slice[start]) {
            start -= 1;
        }
        let end = cmp::min(slice.len(), rejected_at.saturating_add(1));
        let mut err = slow(&slice[start..end]).unwrap_err();
        // `slow` reports offsets relative to the window it was given.
        err.valid_up_to += start;
        err
    }

    // Runs the state machine over the whole input without tracking
    // positions, skipping over runs of ASCII in bulk.
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
        let len = slice.len();
        let mut state = ACCEPT;
        let mut pos = 0;
        while pos < len {
            let byte = slice[pos];
            // Only take the bulk ASCII skip between scalar values, and only
            // when at least two ASCII bytes follow each other.
            if state == ACCEPT && byte < 0x80 {
                if let Some(&next) = slice.get(pos + 1) {
                    if next < 0x80 {
                        pos += ascii::first_non_ascii_byte(&slice[pos..]);
                        continue;
                    }
                }
            }
            state = step(state, byte);
            if state == REJECT {
                return Err(find_valid_up_to(slice, pos));
            }
            pos += 1;
        }
        if state == ACCEPT {
            Ok(())
        } else {
            Err(find_valid_up_to(slice, len))
        }
    }

    fast(slice)
}
/// Decodes the first Unicode scalar value from the start of `slice`.
///
/// Returns the decoded value and the number of bytes it occupied. When the
/// bytes at the start are not valid UTF-8, returns `None` together with the
/// number of bytes to skip. That count is zero only for empty input and is
/// otherwise at least one.
#[inline]
pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
    let bytes = slice.as_ref();
    let first = match bytes.first() {
        Some(&b) => b,
        None => return (None, 0),
    };
    // ASCII needs no state machine.
    if first <= 0x7F {
        return (Some(first as char), 1);
    }

    let mut state = ACCEPT;
    let mut cp = 0;
    for (pos, &b) in bytes.iter().enumerate() {
        decode_step(&mut state, &mut cp, b);
        if state == ACCEPT {
            // SAFETY: the transition table only returns to ACCEPT after a
            // complete sequence, and it has no transitions for overlong
            // forms, surrogates (0xED followed by 0xA0..=0xBF) or values
            // above U+10FFFF (0xF4 followed by 0x90..=0xBF, or 0xF5..=0xFF),
            // so `cp` is a valid scalar value here.
            let ch = unsafe { char::from_u32_unchecked(cp) };
            return (Some(ch), pos + 1);
        }
        if state == REJECT {
            // Skip the bytes seen before the rejected one, but always make
            // progress by at least one byte.
            return (None, cmp::max(1, pos));
        }
    }
    // The input ended in the middle of a sequence: consume all of it.
    (None, bytes.len())
}
/// Like `decode`, but substitutes U+FFFD for anything that fails to decode.
///
/// The returned size is exactly the one `decode` reports, so it is zero for
/// empty input (the character is still U+FFFD in that case).
#[inline]
pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
    let (decoded, size) = decode(slice);
    (decoded.unwrap_or('\u{FFFD}'), size)
}
/// Decodes the last Unicode scalar value from the end of `slice`.
///
/// Returns the decoded value and the number of bytes it occupied. When the
/// bytes at the end are not valid UTF-8, returns `None` together with the
/// number of bytes to drop from the end. That count is zero only for empty
/// input and is otherwise at least one.
#[inline]
pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
    let bytes = slice.as_ref();
    let len = bytes.len();
    if len == 0 {
        return (None, 0);
    }
    // Look for the closest leading byte within the final four bytes. Indices
    // above `floor` are searched from the back; if none of them holds a
    // leading byte, decoding starts at `floor` itself.
    let floor = len.saturating_sub(4);
    let begin = bytes[floor + 1..]
        .iter()
        .rposition(|&b| is_leading_utf8_byte(b))
        .map_or(floor, |offset| floor + 1 + offset);
    match decode(&bytes[begin..]) {
        (ch, size) if begin + size == len => (ch, size),
        // The forward decoder stopped short of the end, so the final byte is
        // not part of whatever it decoded: drop that one byte only.
        _ => (None, 1),
    }
}
/// Like `decode_last`, but substitutes U+FFFD for anything that fails to
/// decode.
///
/// The returned size is exactly the one `decode_last` reports, so it is zero
/// for empty input (the character is still U+FFFD in that case).
#[inline]
pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
    let (decoded, size) = decode_last(slice);
    (decoded.unwrap_or('\u{FFFD}'), size)
}
/// Feeds one byte into the decoding state machine.
///
/// `state` is advanced through the transition table and `cp` accumulates the
/// code point bits. In ACCEPT the byte starts a new value, so `cp` is reset
/// to the byte's payload bits; in any other state the low six bits of the
/// byte are appended to `cp`.
///
/// `state` must be ACCEPT or a value previously produced by this function;
/// the table lookup is bounds-checked and panics for an out-of-range index.
#[inline]
pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
    let class = CLASSES[b as usize] as usize;
    let byte = u32::from(b);
    *cp = if *state == ACCEPT {
        // Shifting 0xFF right by the class yields a mask for the payload bits
        // of a leading byte (and 0xFF itself for ASCII, whose class is 0).
        byte & (0xFF >> class)
    } else {
        (*cp << 6) | (byte & 0b11_1111)
    };
    *state = usize::from(STATES_FORWARD[*state + class]);
}
/// Returns true unless `b` is a UTF-8 continuation byte (`0b10xx_xxxx`).
///
/// This is true for ASCII, for the leading bytes of multi-byte sequences,
/// and also for bytes that never appear in valid UTF-8, since those do not
/// have the continuation bit pattern either.
fn is_leading_utf8_byte(b: u8) -> bool {
    // Continuation bytes are exactly the range 0x80..=0xBF.
    b < 0x80 || b > 0xBF
}
#[cfg(test)]
mod tests {
use std::char;
use ext_slice::{ByteSlice, B};
use tests::LOSSY_TESTS;
use utf8::{self, Utf8Error};
/// Builds the error expected when the input ends in the middle of a
/// sequence, i.e. one without an error length.
fn utf8e(valid_up_to: usize) -> Utf8Error {
    let error_len = None;
    Utf8Error { valid_up_to, error_len }
}
/// Builds the error expected when `error_len` bytes at `valid_up_to` are
/// rejected outright.
fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error {
    let error_len = Some(error_len);
    Utf8Error { valid_up_to, error_len }
}
/// Every Unicode scalar value, encoded by the standard library, must be
/// accepted by the validator.
#[test]
fn validate_all_codepoints() {
    // `char::from_u32` returns None for values that are not scalar values
    // (the surrogate range), so those are skipped.
    for scalar in (0u32..0x11_0000).filter_map(char::from_u32) {
        let mut buf = [0; 4];
        let encoded = scalar.encode_utf8(&mut buf);
        assert_eq!(Ok(()), utf8::validate(encoded.as_bytes()));
    }
}
/// Valid inputs that mix ASCII with two-, three- and four-byte sequences.
#[test]
fn validate_multiple_codepoints() {
    let inputs: &[&[u8]] = &[
        b"abc",
        b"a\xE2\x98\x83a",
        b"a\xF0\x9D\x9C\xB7a",
        b"\xE2\x98\x83\xF0\x9D\x9C\xB7",
        b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",
        b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",
    ];
    for &input in inputs {
        assert_eq!(Ok(()), utf8::validate(input));
    }
}
/// Invalid inputs, each paired with the exact error the validator must
/// report: `(valid_up_to, error_len, input)`.
#[test]
fn validate_errors() {
    let cases: &[(usize, Option<usize>, &[u8])] = &[
        // A byte that can never appear in UTF-8, after 0..=4 valid bytes.
        (0, Some(1), b"\xFF"),
        (1, Some(1), b"a\xFF"),
        (2, Some(1), b"\xCE\xB2\xFF"),
        (3, Some(1), b"\xE2\x98\x83\xFF"),
        (4, Some(1), b"\xF0\x9D\x9D\xB1\xFF"),
        // A sequence cut short by a new leading byte.
        (0, Some(1), b"\xCE\xF0"),
        (0, Some(2), b"\xE2\x98\xF0"),
        (0, Some(3), b"\xF0\x9D\x9D\xF0"),
        // An overlong four-byte encoding.
        (0, Some(1), b"\xF0\x82\x82\xAC"),
        (1, Some(1), b"a\xF0\x82\x82\xAC"),
        (3, Some(1), b"\xE2\x98\x83\xF0\x82\x82\xAC"),
        // An encoded surrogate.
        (0, Some(1), b"\xED\xA0\x80"),
        (1, Some(1), b"a\xED\xA0\x80"),
        (3, Some(1), b"\xE2\x98\x83\xED\xA0\x80"),
        // A two-byte sequence missing its continuation byte.
        (0, Some(1), b"\xCEa"),
        (1, Some(1), b"a\xCEa"),
        (3, Some(1), b"\xE2\x98\x83\xCE\xE2\x98\x83"),
        // A three-byte sequence missing its last byte.
        (0, Some(2), b"\xE2\x98a"),
        (1, Some(2), b"a\xE2\x98a"),
        (3, Some(2), b"\xE2\x98\x83\xE2\x98\xE2\x98\x83"),
        // A four-byte sequence missing its last byte.
        (0, Some(3), b"\xF0\x9D\x9Ca"),
        (1, Some(3), b"a\xF0\x9D\x9Ca"),
        (4, Some(3), b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83"),
        (6, Some(3), b"foobar\xF1\x80\x80quux"),
        // Input that ends in the middle of a sequence: no error length.
        (0, None, b"\xCE"),
        (1, None, b"a\xCE"),
        (3, None, b"\xE2\x98\x83\xCE"),
        (0, None, b"\xE2\x98"),
        (1, None, b"a\xE2\x98"),
        (3, None, b"\xE2\x98\x83\xE2\x98"),
        (0, None, b"\xF0\x9D\x9C"),
        (1, None, b"a\xF0\x9D\x9C"),
        (4, None, b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C"),
        // An invalid byte after several valid multi-byte sequences.
        (8, Some(1), b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF"),
    ];
    for &(valid_up_to, error_len, input) in cases {
        let expected = match error_len {
            Some(len) => utf8e2(valid_up_to, len),
            None => utf8e(valid_up_to),
        };
        assert_eq!(Err(expected), utf8::validate(input));
    }
}
/// Decoding valid strings front to back yields their scalar values in order.
#[test]
fn decode_valid() {
    // Decodes `text` one scalar value at a time from the front.
    fn d(text: &str) -> Vec<char> {
        let mut rest = text;
        let mut decoded = Vec::new();
        while !rest.is_empty() {
            let (ch, size) = utf8::decode(rest.as_bytes());
            rest = &rest[size..];
            decoded.push(ch.unwrap());
        }
        decoded
    }

    assert_eq!(vec!['☃'], d("☃"));
    assert_eq!(vec!['☃', '☃'], d("☃☃"));
    assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
    assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
    assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲"));
}
/// For invalid input, `decode` returns no character and the number of bytes
/// to skip. Each case is `(input, expected size)`.
#[test]
fn decode_invalid() {
    let cases: &[(&[u8], usize)] = &[
        (b"", 0),
        (b"\xFF", 1),
        (b"\xCE\xF0", 1),
        (b"\xE2\x98\xF0", 2),
        (b"\xF0\x9D\x9D", 3),
        (b"\xF0\x9D\x9D\xF0", 3),
        (b"\xF0\x82\x82\xAC", 1),
        (b"\xED\xA0\x80", 1),
        (b"\xCEa", 1),
        (b"\xE2\x98a", 2),
        (b"\xF0\x9D\x9Ca", 3),
    ];
    for &(input, expected_size) in cases {
        let (ch, size) = utf8::decode(input);
        assert_eq!(None, ch);
        assert_eq!(expected_size, size);
    }
}
/// For invalid input, `decode_lossy` returns U+FFFD and the number of bytes
/// to skip. Each case is `(input, expected size)`.
#[test]
fn decode_lossy() {
    let cases: &[(&[u8], usize)] = &[
        (b"", 0),
        (b"\xFF", 1),
        (b"\xCE\xF0", 1),
        (b"\xE2\x98\xF0", 2),
        (b"\xF0\x9D\x9D\xF0", 3),
        (b"\xF0\x82\x82\xAC", 1),
        (b"\xED\xA0\x80", 1),
        (b"\xCEa", 1),
        (b"\xE2\x98a", 2),
        (b"\xF0\x9D\x9Ca", 3),
    ];
    for &(input, expected_size) in cases {
        let (ch, size) = utf8::decode_lossy(input);
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(expected_size, size);
    }
}
/// Decoding valid strings back to front yields their scalar values in
/// reverse order.
#[test]
fn decode_last_valid() {
    // Decodes `text` one scalar value at a time from the back.
    fn d(text: &str) -> Vec<char> {
        let mut rest = text;
        let mut decoded = Vec::new();
        while !rest.is_empty() {
            let (ch, size) = utf8::decode_last(rest.as_bytes());
            rest = &rest[..rest.len() - size];
            decoded.push(ch.unwrap());
        }
        decoded
    }

    assert_eq!(vec!['☃'], d("☃"));
    assert_eq!(vec!['☃', '☃'], d("☃☃"));
    assert_eq!(vec!['ε', 'δ', 'γ', 'β', 'α'], d("αβγδε"));
    assert_eq!(vec!['⛇', '⛄', '☃'], d("☃⛄⛇"));
    assert_eq!(vec!['𝗲', '𝗱', '𝗰', '𝗯', '𝗮'], d("𝗮𝗯𝗰𝗱𝗲"));
}
/// For invalid input, `decode_last` returns no character and the number of
/// bytes to drop from the end. Each case is `(input, expected size)`.
#[test]
fn decode_last_invalid() {
    let cases: &[(&[u8], usize)] = &[
        (b"", 0),
        (b"\xFF", 1),
        (b"\xCE\xF0", 1),
        (b"\xCE", 1),
        (b"\xE2\x98\xF0", 1),
        (b"\xE2\x98", 2),
        (b"\xF0\x9D\x9D\xF0", 1),
        (b"\xF0\x9D\x9D", 3),
        (b"\xF0\x82\x82\xAC", 1),
        (b"\xED\xA0\x80", 1),
        (b"\xED\xA0", 1),
        (b"\xED", 1),
        (b"a\xCE", 1),
        (b"a\xE2\x98", 2),
        (b"a\xF0\x9D\x9C", 3),
    ];
    for &(input, expected_size) in cases {
        let (ch, size) = utf8::decode_last(input);
        assert_eq!(None, ch);
        assert_eq!(expected_size, size);
    }
}
/// For invalid input, `decode_last_lossy` returns U+FFFD and the number of
/// bytes to drop from the end. Each case is `(input, expected size)`.
#[test]
fn decode_last_lossy() {
    let cases: &[(&[u8], usize)] = &[
        (b"", 0),
        (b"\xFF", 1),
        (b"\xCE\xF0", 1),
        (b"\xCE", 1),
        (b"\xE2\x98\xF0", 1),
        (b"\xE2\x98", 2),
        (b"\xF0\x9D\x9D\xF0", 1),
        (b"\xF0\x9D\x9D", 3),
        (b"\xF0\x82\x82\xAC", 1),
        (b"\xED\xA0\x80", 1),
        (b"\xED\xA0", 1),
        (b"\xED", 1),
        (b"a\xCE", 1),
        (b"a\xE2\x98", 2),
        (b"a\xF0\x9D\x9C", 3),
    ];
    for &(input, expected_size) in cases {
        let (ch, size) = utf8::decode_last_lossy(input);
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(expected_size, size);
    }
}
/// Runs the shared lossy-decoding fixtures through both iterators, in both
/// directions, and compares the collected text with the expected string.
#[test]
fn chars() {
    for (ith, &(want, given)) in LOSSY_TESTS.iter().enumerate() {
        // Front to back.
        let forward: String = B(given).chars().collect();
        assert_eq!(
            want, forward,
            "chars(ith: {:?}, given: {:?})",
            ith, given
        );

        let forward_indexed: String =
            B(given).char_indices().map(|(_, _, ch)| ch).collect();
        assert_eq!(
            want, forward_indexed,
            "char_indices(ith: {:?}, given: {:?})",
            ith, given
        );

        // Back to front: the expected text is the fixture reversed.
        let want_reversed: String = want.chars().rev().collect();

        let backward: String = B(given).chars().rev().collect();
        assert_eq!(
            want_reversed, backward,
            "chars.rev(ith: {:?}, given: {:?})",
            ith, given
        );

        let backward_indexed: String =
            B(given).char_indices().rev().map(|(_, _, ch)| ch).collect();
        assert_eq!(
            want_reversed, backward_indexed,
            "char_indices.rev(ith: {:?}, given: {:?})",
            ith, given
        );
    }
}
}