#![cfg_attr(test, feature(test))]
#[macro_use]
extern crate debug_unreachable;
#[macro_use]
extern crate mac;
#[cfg(test)]
extern crate test as std_test;
use std::{slice, char};
/// What the bytes of a `Codepoint` turned out to represent.
///
/// Variant order is significant: the derived `PartialOrd`/`Ord` compare
/// variants in declaration order.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Meaning {
    /// A complete sequence that decodes to a Unicode scalar value.
    Whole(char),

    /// A complete three-byte sequence encoding a lead surrogate
    /// (U+D800 to U+DBFF). The payload is the code unit minus `0xD800`.
    LeadSurrogate(u16),

    /// A complete three-byte sequence encoding a trail surrogate
    /// (U+DC00 to U+DFFF). The payload is the code unit minus `0xDC00`.
    TrailSurrogate(u16),

    /// The buffer ends before the sequence does. The payload is the number
    /// of bytes still missing to complete it.
    Prefix(usize),

    /// Only continuation bytes were found between the queried index and the
    /// beginning of the buffer, so the start of the sequence is not available.
    Suffix,
}
/// The result of classifying one byte position inside a buffer: the bytes of
/// the (possibly incomplete) codepoint around that position and what they mean.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Codepoint<'a> {
    /// The bytes belonging to this codepoint, borrowed from the classified
    /// buffer.
    pub bytes: &'a [u8],

    /// Distance in bytes from the first byte of `bytes` to the index that was
    /// passed to `classify`.
    pub rewind: usize,

    /// What `bytes` represents.
    pub meaning: Meaning,
}
/// The role a single byte can play inside a UTF-8 stream.
#[derive(Debug, PartialEq, Eq)]
enum Byte {
    /// A single-byte (7-bit) codepoint.
    Ascii,
    /// The first byte of a multibyte sequence; the payload is the total
    /// sequence length (2, 3 or 4) announced by the byte.
    Start(usize),
    /// A continuation byte (`10xxxxxx`).
    Cont,
}

impl Byte {
    /// Classifies `x` purely by its high bits.
    ///
    /// Returns `None` for bytes of the form `11111xxx`, which cannot start a
    /// sequence of at most four bytes. No range validation beyond the bit
    /// pattern is done here (for instance `0xC0` is still reported as
    /// `Start(2)`).
    #[inline(always)]
    fn classify(x: u8) -> Option<Byte> {
        match x {
            // 0xxxxxxx
            0x00..=0x7F => Some(Byte::Ascii),
            // 10xxxxxx
            0x80..=0xBF => Some(Byte::Cont),
            // 110xxxxx
            0xC0..=0xDF => Some(Byte::Start(2)),
            // 1110xxxx
            0xE0..=0xEF => Some(Byte::Start(3)),
            // 11110xxx
            0xF0..=0xF7 => Some(Byte::Start(4)),
            // 11111xxx
            _ => None,
        }
    }
}
/// Returns `true` when every byte of `buf` is a UTF-8 continuation byte.
///
/// An empty slice yields `true`.
#[inline(always)]
fn all_cont(buf: &[u8]) -> bool {
    buf.iter()
        .map(|&byte| Byte::classify(byte))
        .all(|class| class == Some(Byte::Cont))
}
/// Decodes one complete 2-, 3- or 4-byte sequence into its `Meaning`.
///
/// Only the payload bits are extracted (the low 5/4/3 bits of the first byte
/// and the low 6 bits of every following byte); this function does not check
/// that the bytes really are a start byte followed by continuation bytes.
///
/// Returns:
/// * `None` for overlong encodings (a value that would fit in a shorter
///   sequence) and for values `char::from_u32` rejects;
/// * `Some(Meaning::LeadSurrogate(_))` / `Some(Meaning::TrailSurrogate(_))`
///   for three-byte sequences in the surrogate range, with the payload
///   rebased to the start of the respective range;
/// * `Some(Meaning::Whole(_))` otherwise.
///
/// # Safety
///
/// `buf.len()` must be 2, 3 or 4. This is only verified with `debug_assert!`;
/// the bytes are read with `get_unchecked`, and any other length reaches
/// `debug_unreachable!()`.
#[inline(always)]
unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
    debug_assert!(buf.len() >= 2);
    debug_assert!(buf.len() <= 4);

    let n: u32 = match buf.len() {
        2 => {
            let n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6
                | ((*buf.get_unchecked(1) & 0x3F) as u32);
            // Overlong: fits in a single byte.
            if n < 0x80 {
                return None;
            }
            n
        }
        3 => {
            let n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12
                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6
                | ((*buf.get_unchecked(2) & 0x3F) as u32);
            match n {
                // Overlong: fits in one or two bytes.
                0x0000..=0x07FF => return None,
                0xD800..=0xDBFF => {
                    return Some(Meaning::LeadSurrogate(n as u16 - 0xD800));
                }
                0xDC00..=0xDFFF => {
                    return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00));
                }
                _ => n,
            }
        }
        4 => {
            let n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18
                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12
                | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6
                | ((*buf.get_unchecked(3) & 0x3F) as u32);
            // Overlong: fits in three bytes or fewer.
            if n < 0x1_0000 {
                return None;
            }
            n
        }
        // Excluded by the safety contract above.
        _ => debug_unreachable!(),
    };

    // Surrogates and overlongs were handled above; `from_u32` still rejects
    // anything that is not a valid `char` (e.g. values past U+10FFFF).
    char::from_u32(n).map(Meaning::Whole)
}
/// Returns the `new_len` bytes of `buf` beginning at offset `start`, skipping
/// the bounds checks that `&buf[start..start + new_len]` would perform.
///
/// # Safety
///
/// The caller must guarantee `start <= buf.len()` and
/// `new_len <= buf.len() - start`. Both conditions are only verified by
/// `debug_assert!`.
#[inline(always)]
unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    let first = buf.as_ptr().add(start);
    slice::from_raw_parts(first, new_len)
}
// Shorthand used inside functions that return `Option`: forwards `$x` to
// `unwrap_or_return!` with `None` as the second argument.
// NOTE(review): `unwrap_or_return!` is not defined in this file (presumably it
// comes from the `mac` crate pulled in with `#[macro_use]`) and is assumed to
// evaluate to the value inside `Some` or otherwise `return` its second
// argument -- confirm against that crate.
macro_rules! otry {
($x:expr) => { unwrap_or_return!($x, None) }
}
/// Describes the UTF-8 codepoint that contains the byte at index `idx` of
/// `buf`.
///
/// The returned `Codepoint` borrows the bytes of that codepoint from `buf`,
/// reports how far `idx` lies past its first byte (`rewind`), and says what
/// the bytes mean:
///
/// * `Meaning::Whole`, `Meaning::LeadSurrogate` or `Meaning::TrailSurrogate`
///   when the complete sequence is present;
/// * `Meaning::Prefix(k)` when the buffer ends `k` bytes before the sequence
///   does;
/// * `Meaning::Suffix` when only continuation bytes lie between `idx` and the
///   beginning of the buffer (at most three bytes are inspected).
///
/// Returns `None` when `idx` is out of range or when the bytes around `idx`
/// are not well formed: an impossible start byte, a start byte not followed
/// by continuation bytes (also when the sequence is truncated by the end of
/// the buffer), a continuation byte that lies beyond the end of the sequence
/// preceding it, or a sequence that `decode` rejects.
#[inline]
pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
    if idx >= buf.len() {
        return None;
    }

    unsafe {
        // SAFETY: `idx < buf.len()` was checked above; every index used below
        // is either `idx` itself or lies in `0..idx`.
        let x = *buf.get_unchecked(idx);
        match otry!(Byte::classify(x)) {
            Byte::Ascii => Some(Codepoint {
                bytes: unsafe_slice(buf, idx, 1),
                rewind: 0,
                meaning: Meaning::Whole(x as char),
            }),

            Byte::Start(n) => classify_sequence(buf, idx, n, 0),

            Byte::Cont => {
                // Walk backwards looking for the start byte of the sequence.
                let mut start = idx;
                loop {
                    if start == 0 {
                        // Ran off the beginning of the buffer while still
                        // seeing continuation bytes.
                        return Some(Codepoint {
                            bytes: unsafe_slice(buf, 0, idx + 1),
                            rewind: idx,
                            meaning: Meaning::Suffix,
                        });
                    }

                    start -= 1;
                    let rewind = idx - start;
                    match otry!(Byte::classify(*buf.get_unchecked(start))) {
                        Byte::Cont => (),
                        Byte::Start(n) => return classify_sequence(buf, start, n, rewind),
                        // A continuation byte directly after ASCII is stray.
                        Byte::Ascii => return None,
                    }

                    if rewind >= 3 {
                        // Three bytes before `idx` were inspected and none of
                        // them was a start byte; no sequence is that long.
                        return None;
                    }
                }
            }
        }
    }
}

/// Builds the `Codepoint` for the sequence whose start byte is at `start` and
/// announces a total length of `n`, where the queried byte lies `rewind`
/// bytes after `start`.
///
/// Returns `None` if the queried byte lies beyond the end of the sequence
/// (`rewind >= n`), if any not-yet-inspected byte of the sequence is not a
/// continuation byte, or if the complete sequence is rejected by `decode`.
///
/// # Safety
///
/// `start + rewind < buf.len()` must hold, `n` must be 2, 3 or 4, and the
/// bytes at `start + 1 ..= start + rewind` must already be known to be
/// continuation bytes.
#[inline(always)]
unsafe fn classify_sequence<'a>(
    buf: &'a [u8],
    start: usize,
    n: usize,
    rewind: usize,
) -> Option<Codepoint<'a>> {
    if rewind >= n {
        // The queried continuation byte is not covered by this sequence, so
        // it does not belong to any codepoint.
        return None;
    }

    let avail = buf.len() - start;
    // Number of bytes of the sequence actually present in the buffer.
    let len = if avail < n { avail } else { n };
    let bytes = unsafe_slice(buf, start, len);

    // Bytes up to and including `rewind` are already known to be fine; the
    // remainder must be continuation bytes, whether or not the sequence is
    // complete. `rewind + 1 <= len` follows from `rewind < n` and
    // `start + rewind < buf.len()`.
    if !all_cont(unsafe_slice(bytes, rewind + 1, len - rewind - 1)) {
        return None;
    }

    let meaning = if len == n {
        otry!(decode(bytes))
    } else {
        Meaning::Prefix(n - len)
    };

    Some(Codepoint {
        bytes: bytes,
        rewind: rewind,
        meaning: meaning,
    })
}
#[cfg(test)]
mod test;