use std::{char, str, mem};
use std::default::Default;
use std::io::Write;
use futf::{self, Codepoint, Meaning};
use util::unsafe_slice;
pub mod imp {
use std::{iter, slice, mem};
use std::default::Default;
/// Describes a byte-level adjustment to make where two buffers are joined.
///
/// The default value requests no adjustment at all. `WTF8::fixup` elsewhere
/// in this file is the one place in view that produces a non-default value.
pub struct Fixup {
    /// Number of bytes to drop on the left-hand side
    /// (presumably counted from the end of the left buffer — confirm against callers).
    pub drop_left: u32,
    /// Number of bytes to drop on the right-hand side
    /// (presumably counted from the start of the right buffer — confirm against callers).
    pub drop_right: u32,
    /// How many leading bytes of `insert_bytes` are meaningful.
    pub insert_len: u32,
    /// Storage for the bytes to insert; only the first `insert_len` are used.
    pub insert_bytes: [u8; 4],
}

impl Default for Fixup {
    /// Builds the "nothing to do" fixup: no bytes dropped, no bytes inserted.
    #[inline(always)]
    fn default() -> Fixup {
        const NONE: u32 = 0;
        Fixup {
            insert_bytes: [0u8; 4],
            insert_len: NONE,
            drop_right: NONE,
            drop_left: NONE,
        }
    }
}
/// Reinterprets `n` as a `char` without any validity check.
///
/// # Safety
///
/// `n` must be a valid Unicode scalar value; otherwise the resulting `char`
/// is invalid and using it is undefined behaviour.
#[inline(always)]
unsafe fn from_u32_unchecked(n: u32) -> char {
    // Spell out both types so the transmute cannot silently change meaning.
    mem::transmute::<u32, char>(n)
}
/// Iterator over a byte slice that yields `(index, char)` pairs, where each
/// `char` has the same code point as the byte at that index.
pub struct SingleByteCharIndices<'a> {
    inner: iter::Enumerate<slice::Iter<'a, u8>>,
}

impl<'a> Iterator for SingleByteCharIndices<'a> {
    type Item = (usize, char);

    /// Advances to the next byte and returns its position and `char` value.
    #[inline]
    fn next(&mut self) -> Option<(usize, char)> {
        match self.inner.next() {
            // A `u8` widened to `u32` is at most 0xFF, which is always a
            // valid scalar value, so the unchecked conversion is sound.
            Some((index, &byte)) => Some((index, unsafe { from_u32_unchecked(byte as u32) })),
            None => None,
        }
    }
}

impl<'a> SingleByteCharIndices<'a> {
    /// Creates an iterator over the bytes of `buf`.
    #[inline]
    pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
        let inner = buf.iter().enumerate();
        SingleByteCharIndices { inner: inner }
    }
}
}
pub unsafe trait Format {
fn validate(buf: &[u8]) -> bool;
#[inline]
fn validate_prefix(buf: &[u8]) -> bool {
<Self as Format>::validate(buf)
}
#[inline]
fn validate_suffix(buf: &[u8]) -> bool {
<Self as Format>::validate(buf)
}
#[inline]
fn validate_subseq(buf: &[u8]) -> bool {
<Self as Format>::validate(buf)
}
#[inline(always)]
unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
Default::default()
}
}
/// Marks a format as a subset of the format `Super`.
///
/// The trait is `unsafe` to implement; presumably implementors promise that
/// every buffer valid in `Self` is also valid in `Super` — confirm against
/// the users of this trait.
pub unsafe trait SubsetOf<Super: Format>: Format {
    /// Checks whether `x` is valid in this (subset) format.
    ///
    /// The default simply runs this format's own `validate`.
    fn revalidate_subset(x: &[u8]) -> bool {
        <Self as Format>::validate(x)
    }
}
/// A format that has an associated borrowed slice type.
///
/// Implementations in this file pair `Bytes` with `[u8]` and `UTF8` with `str`.
pub unsafe trait SliceFormat: Format + Sized {
/// The (possibly unsized) slice type for this format; it must implement `Slice`.
type Slice: ?Sized + Slice;
}
/// A format whose buffers can be iterated as `char`s and into which a
/// `char` can be encoded.
pub unsafe trait CharFormat<'a>: Format {
/// Iterator type returned by `char_indices`, yielding `(usize, char)` pairs.
type Iter: Iterator<Item=(usize, char)>;
/// Iterates over the characters of `buf` together with a `usize` position.
///
/// NOTE(review): this is `unsafe`, presumably because `buf` must already be
/// valid in this format — confirm against callers.
unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;
/// Encodes `ch` in this format and passes the encoded bytes to `cont`.
///
/// Implementations in this file return `Err(())` without calling `cont`
/// when `ch` cannot be represented, and `Ok(())` otherwise.
fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
where F: FnOnce(&[u8]);
}
/// A borrowed slice type that can be viewed as bytes and reconstructed from bytes.
pub unsafe trait Slice {
/// Returns the bytes underlying this slice.
fn as_bytes(&self) -> &[u8];
/// Reinterprets `x` as a shared reference to this slice type.
///
/// NOTE(review): this is `unsafe`, presumably because `x` must be valid for
/// the corresponding format (e.g. UTF-8 for `str`) — confirm against callers.
unsafe fn from_bytes(x: &[u8]) -> &Self;
/// Reinterprets `x` as a mutable reference to this slice type.
///
/// NOTE(review): presumably carries the same validity requirement as
/// `from_bytes` — confirm against callers.
unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
/// Marker type for the format that accepts arbitrary bytes.
#[derive(Copy, Clone, Default, Debug)]
pub struct Bytes;

unsafe impl Format for Bytes {
    /// Every byte sequence is acceptable, so this always returns `true`.
    #[inline(always)]
    fn validate(_buf: &[u8]) -> bool {
        true
    }
}
// The slice type for `Bytes` is a plain byte slice.
unsafe impl SliceFormat for Bytes {
type Slice = [u8];
}
// A byte slice is already its own byte representation, so every conversion
// here is the identity.
unsafe impl Slice for [u8] {
    /// Returns the slice itself.
    #[inline(always)]
    fn as_bytes(&self) -> &[u8] {
        self
    }

    /// Returns `bytes` unchanged.
    #[inline(always)]
    unsafe fn from_bytes(bytes: &[u8]) -> &Self {
        bytes
    }

    /// Returns `bytes` unchanged.
    #[inline(always)]
    unsafe fn from_mut_bytes(bytes: &mut [u8]) -> &mut Self {
        bytes
    }
}
/// Marker type for the ASCII format: every byte must be in `0..=127`.
#[derive(Copy, Clone, Default, Debug)]
pub struct ASCII;

unsafe impl Format for ASCII {
    /// Returns `true` when no byte of `buf` has its high bit set.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        !buf.iter().any(|&byte| byte > 0x7F)
    }

    /// Always `true`; the argument is not inspected.
    ///
    /// NOTE(review): presumably sound because callers only pass pieces of
    /// buffers that already passed `validate` — confirm against callers.
    #[inline(always)]
    fn validate_prefix(_buf: &[u8]) -> bool {
        true
    }

    /// Always `true`; the argument is not inspected.
    #[inline(always)]
    fn validate_suffix(_buf: &[u8]) -> bool {
        true
    }

    /// Always `true`; the argument is not inspected.
    #[inline(always)]
    fn validate_subseq(_buf: &[u8]) -> bool {
        true
    }
}
// ASCII is declared a subset of both UTF8 and Latin1. Both impls use the
// default `revalidate_subset`, which runs `ASCII::validate`.
unsafe impl SubsetOf<UTF8> for ASCII { }
unsafe impl SubsetOf<Latin1> for ASCII { }
unsafe impl<'a> CharFormat<'a> for ASCII {
    type Iter = imp::SingleByteCharIndices<'a>;

    /// Iterates over `buf` one byte at a time, yielding `(index, char)`.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
        imp::SingleByteCharIndices::new(buf)
    }

    /// Encodes `ch` as a single ASCII byte and hands it to `cont`.
    ///
    /// Returns `Err(())`, without calling `cont`, when `ch` is above U+007F.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
        where F: FnOnce(&[u8])
    {
        let code = ch as u32;
        if code <= 0x7F {
            let encoded = [code as u8];
            cont(&encoded);
            Ok(())
        } else {
            Err(())
        }
    }
}
/// Marker type for the UTF-8 format.
#[derive(Copy, Clone, Default, Debug)]
pub struct UTF8;

unsafe impl Format for UTF8 {
    /// Returns `true` when `buf` is accepted by `str::from_utf8`.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        match str::from_utf8(buf) {
            Ok(_) => true,
            Err(_) => false,
        }
    }

    /// Accepts an empty buffer, or one where `futf::classify` at the last
    /// byte reports a whole code point.
    ///
    /// Only the end of `buf` is inspected, which suggests callers pass
    /// prefixes of already-valid buffers — confirm against callers.
    #[inline]
    fn validate_prefix(buf: &[u8]) -> bool {
        match buf.len() {
            0 => true,
            len => match futf::classify(buf, len - 1) {
                Some(Codepoint { meaning: Meaning::Whole(_), .. }) => true,
                _ => false,
            },
        }
    }

    /// Accepts an empty buffer, or one where `futf::classify` at the first
    /// byte reports a whole code point.
    #[inline]
    fn validate_suffix(buf: &[u8]) -> bool {
        if buf.is_empty() {
            return true;
        }
        let first = futf::classify(buf, 0);
        match first {
            Some(Codepoint { meaning: Meaning::Whole(_), .. }) => true,
            _ => false,
        }
    }

    /// Accepts `buf` when both the prefix check and the suffix check pass.
    #[inline]
    fn validate_subseq(buf: &[u8]) -> bool {
        let ends_cleanly = <UTF8 as Format>::validate_prefix(buf);
        ends_cleanly && <UTF8 as Format>::validate_suffix(buf)
    }
}
// UTF8 is declared a subset of WTF8. The default `revalidate_subset` is
// used, which runs `UTF8::validate`.
unsafe impl SubsetOf<WTF8> for UTF8 { }
// The slice type for `UTF8` is `str`.
unsafe impl SliceFormat for UTF8 {
type Slice = str;
}
unsafe impl Slice for str {
    /// Returns the UTF-8 bytes of the string slice.
    #[inline(always)]
    fn as_bytes(&self) -> &[u8] {
        <str>::as_bytes(self)
    }

    /// Reinterprets `bytes` as a `&str` without checking it.
    ///
    /// The caller must guarantee that `bytes` is valid UTF-8.
    #[inline(always)]
    unsafe fn from_bytes(bytes: &[u8]) -> &str {
        str::from_utf8_unchecked(bytes)
    }

    /// Reinterprets `bytes` as a `&mut str` without checking it.
    ///
    /// The caller must guarantee that `bytes` is valid UTF-8.
    #[inline(always)]
    unsafe fn from_mut_bytes(bytes: &mut [u8]) -> &mut str {
        // Spell out both types so the transmute cannot silently change meaning.
        mem::transmute::<&mut [u8], &mut str>(bytes)
    }
}
unsafe impl<'a> CharFormat<'a> for UTF8 {
    type Iter = str::CharIndices<'a>;

    /// Iterates over the characters of `buf` with their byte offsets.
    ///
    /// `buf` is converted with `str::from_utf8_unchecked`, so the caller must
    /// guarantee that it is valid UTF-8.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> {
        str::from_utf8_unchecked(buf).char_indices()
    }

    /// Encodes `ch` as UTF-8 and passes the encoded bytes to `cont`.
    ///
    /// Always returns `Ok(())`.
    ///
    /// # Panics
    ///
    /// Panics with "Tendril: internal error" if writing `ch` into the
    /// four-byte scratch buffer fails.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
        where F: FnOnce(&[u8])
    {
        // Zero-initialised scratch space. `mem::uninitialized()` is
        // deprecated and undefined behaviour; zeroing four bytes is free.
        let mut utf_8 = [0u8; 4];
        let bytes_written = {
            // Writing through a `&mut [u8]` advances the slice, so the
            // remaining length tells us how many bytes were produced.
            let mut buffer = &mut utf_8[..];
            write!(buffer, "{}", ch).ok().expect("Tendril: internal error");
            debug_assert!(buffer.len() <= 4);
            4 - buffer.len()
        };
        // SAFETY: `bytes_written` is `4 - remaining` with `remaining <= 4`,
        // so the range `0..bytes_written` lies within `utf_8`.
        cont(unsafe { unsafe_slice(&utf_8, 0, bytes_written) });
        Ok(())
    }
}
/// Marker type for the WTF-8 format.
///
/// Its validation in this file accepts whole code points as well as lead and
/// trail surrogates.
#[derive(Copy, Clone, Default, Debug)]
pub struct WTF8;
/// Returns `true` when `m` is a classification that WTF-8 accepts: a whole
/// code point, a lead surrogate, or a trail surrogate.
#[inline]
fn wtf8_meaningful(m: Meaning) -> bool {
    match m {
        Meaning::Whole(_) => true,
        Meaning::LeadSurrogate(_) => true,
        Meaning::TrailSurrogate(_) => true,
        _ => false,
    }
}
unsafe impl Format for WTF8 {
    /// Returns `true` when `buf` is valid WTF-8.
    ///
    /// Walks `buf` one classified code point at a time. It is rejected when a
    /// position cannot be classified, when a classification is not
    /// `wtf8_meaningful`, or when a trail surrogate directly follows a lead
    /// surrogate.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        let mut i = 0;
        let mut prev_lead = false;
        while i < buf.len() {
            let codept = unwrap_or_return!(futf::classify(buf, i), false);
            if !wtf8_meaningful(codept.meaning) {
                return false;
            }
            i += codept.bytes.len();
            // Remember whether this code point was a lead surrogate so that
            // an immediately following trail surrogate can be rejected.
            prev_lead = match codept.meaning {
                Meaning::TrailSurrogate(_) if prev_lead => return false,
                Meaning::LeadSurrogate(_) => true,
                _ => false,
            };
        }
        true
    }

    /// Accepts an empty buffer, or one where the classification at the last
    /// byte is `wtf8_meaningful`.
    #[inline]
    fn validate_prefix(buf: &[u8]) -> bool {
        if buf.len() == 0 {
            return true;
        }
        match futf::classify(buf, buf.len() - 1) {
            Some(c) => wtf8_meaningful(c.meaning),
            _ => false,
        }
    }

    /// Accepts an empty buffer, or one where the classification at the first
    /// byte is `wtf8_meaningful`.
    #[inline]
    fn validate_suffix(buf: &[u8]) -> bool {
        if buf.len() == 0 {
            return true;
        }
        match futf::classify(buf, 0) {
            Some(c) => wtf8_meaningful(c.meaning),
            _ => false,
        }
    }

    /// Accepts `buf` when both the prefix check and the suffix check pass.
    #[inline]
    fn validate_subseq(buf: &[u8]) -> bool {
        <Self as Format>::validate_prefix(buf)
            && <Self as Format>::validate_suffix(buf)
    }

    /// Computes the adjustment needed where `lhs` is followed by `rhs`.
    ///
    /// When `lhs` ends in a lead surrogate and `rhs` starts with a trail
    /// surrogate, the result drops 3 bytes on each side and inserts the
    /// encoding of the combined code point. Otherwise the default `Fixup`
    /// is returned.
    ///
    /// # Panics
    ///
    /// Panics with "WTF8: internal error" if the combined value is not a
    /// valid `char` or cannot be written into the four-byte insert buffer.
    #[inline]
    unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
        const ERR: &'static str = "WTF8: internal error";
        // This branch drops 3 bytes per side, so shorter inputs are skipped.
        if lhs.len() >= 3 && rhs.len() >= 3 {
            if let (Some(Codepoint { meaning: Meaning::LeadSurrogate(hi), .. }),
                    Some(Codepoint { meaning: Meaning::TrailSurrogate(lo), .. }))
                = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0))
            {
                let mut fixup = imp::Fixup {
                    drop_left: 3,
                    drop_right: 3,
                    insert_len: 0,
                    // Zero-initialised instead of `mem::uninitialized()`,
                    // which is deprecated and undefined behaviour; it also
                    // keeps uninitialised bytes out of the returned value.
                    insert_bytes: [0; 4],
                };
                // NOTE(review): assumes `hi` and `lo` are the 10-bit surrogate
                // payloads as reported by futf — confirm.
                let n = 0x10000 + ((hi as u32) << 10) + (lo as u32);
                fixup.insert_len = {
                    // Writing through a `&mut [u8]` advances the slice, so the
                    // remaining length tells us how many bytes were produced.
                    let mut buffer = &mut fixup.insert_bytes[..];
                    write!(buffer, "{}", char::from_u32(n).expect(ERR)).ok().expect(ERR);
                    debug_assert!(buffer.len() <= 4);
                    4 - buffer.len() as u32
                };
                return fixup;
            }
        }
        Default::default()
    }
}
/// Marker type for the Latin-1 format, in which every byte sequence is valid.
#[derive(Copy, Clone, Default, Debug)]
pub struct Latin1;

unsafe impl Format for Latin1 {
    /// Always `true`; the argument is not inspected.
    #[inline(always)]
    fn validate(_buf: &[u8]) -> bool {
        true
    }

    /// Always `true`; the argument is not inspected.
    #[inline(always)]
    fn validate_prefix(_buf: &[u8]) -> bool {
        true
    }

    /// Always `true`; the argument is not inspected.
    #[inline(always)]
    fn validate_suffix(_buf: &[u8]) -> bool {
        true
    }

    /// Always `true`; the argument is not inspected.
    #[inline(always)]
    fn validate_subseq(_buf: &[u8]) -> bool {
        true
    }
}
unsafe impl<'a> CharFormat<'a> for Latin1 {
    type Iter = imp::SingleByteCharIndices<'a>;

    /// Iterates over `buf` one byte at a time, yielding `(index, char)`.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
        imp::SingleByteCharIndices::new(buf)
    }

    /// Encodes `ch` as a single Latin-1 byte and hands it to `cont`.
    ///
    /// Returns `Err(())`, without calling `cont`, when `ch` is above U+00FF.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
        where F: FnOnce(&[u8])
    {
        let code = ch as u32;
        if code <= 0xFF {
            let encoded = [code as u8];
            cont(&encoded);
            Ok(())
        } else {
            Err(())
        }
    }
}