use std::char;
/// Marker bits OR-ed into every continuation byte (`10xx_xxxx`).
const TAG_CONT: u8 = 0b1000_0000;
/// Marker bits OR-ed into the lead byte of a two-byte sequence (`110x_xxxx`).
const TAG_TWO: u8 = 0b1100_0000;
/// Marker bits OR-ed into the lead byte of a three-byte sequence (`1110_xxxx`).
const TAG_THREE: u8 = 0b1110_0000;
/// Marker bits OR-ed into the lead byte of a four-byte sequence (`1111_0xxx`).
const TAG_FOUR: u8 = 0b1111_0000;
/// Encodes `character` as UTF-8 into the front of `dst`.
///
/// Returns the number of bytes written (1 to 4). Returns `None`, leaving
/// `dst` untouched, when `dst` is too short to hold the whole encoding.
#[allow(dead_code)]
#[inline]
pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
    let scalar = character as u32;
    // How many bytes the encoding occupies, decided by the scalar's range.
    let width = if scalar < 0x80 {
        1
    } else if scalar < 0x800 {
        2
    } else if scalar < 0x1_0000 {
        3
    } else {
        4
    };
    if dst.len() < width {
        return None;
    }
    // Builds the continuation byte holding the six payload bits at `shift`.
    let cont = |shift: u32| (scalar >> shift & 0x3F) as u8 | TAG_CONT;
    match width {
        1 => dst[0] = scalar as u8,
        2 => {
            dst[0] = (scalar >> 6 & 0x1F) as u8 | TAG_TWO;
            dst[1] = cont(0);
        }
        3 => {
            dst[0] = (scalar >> 12 & 0x0F) as u8 | TAG_THREE;
            dst[1] = cont(6);
            dst[2] = cont(0);
        }
        _ => {
            dst[0] = (scalar >> 18 & 0x07) as u8 | TAG_FOUR;
            dst[1] = cont(12);
            dst[2] = cont(6);
            dst[3] = cont(0);
        }
    }
    Some(width)
}
#[inline]
pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
let b0 = match src.get(0) {
None => return None,
Some(&b) if b <= 0x7F => return Some((b as char, 1)),
Some(&b) => b,
};
match b0 {
0b110_00000 ... 0b110_11111 => {
if src.len() < 2 {
return None;
}
let b1 = src[1];
let cp = ((b0 & !TAG_TWO) as u32) << 6
| ((b1 & !TAG_CONT) as u32);
match cp {
0x80 ... 0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
_ => None,
}
}
0b1110_0000 ... 0b1110_1111 => {
if src.len() < 3 {
return None;
}
let (b1, b2) = (src[1], src[2]);
let cp = ((b0 & !TAG_THREE) as u32) << 12
| ((b1 & !TAG_CONT) as u32) << 6
| ((b2 & !TAG_CONT) as u32);
match cp {
0x800 ... 0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
_ => None,
}
}
0b11110_000 ... 0b11110_111 => {
if src.len() < 4 {
return None;
}
let (b1, b2, b3) = (src[1], src[2], src[3]);
let cp = ((b0 & !TAG_FOUR) as u32) << 18
| ((b1 & !TAG_CONT) as u32) << 12
| ((b2 & !TAG_CONT) as u32) << 6
| ((b3 & !TAG_CONT) as u32);
match cp {
0x10000 ... 0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
_ => None,
}
}
_ => None,
}
}
/// Decodes the UTF-8 encoded scalar value that ends `src`.
///
/// Returns the decoded `char` and the number of bytes it occupied, or `None`
/// when `src` is empty or `decode_utf8` cannot decode a sequence that ends
/// exactly at the last byte of `src`.
pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
    let last = match src.last() {
        None => return None,
        Some(&b) => b,
    };
    if last <= 0x7F {
        return Some((last as char, 1));
    }
    let end = src.len() - 1;
    // Never look back further than four bytes from the end.
    let floor = src.len().saturating_sub(4);
    // Walk backwards from the byte before the last one to the nearest start
    // byte; if none is found, fall back to the earliest index allowed.
    let start = (floor..end)
        .rev()
        .find(|&i| is_start_byte(src[i]))
        .unwrap_or(floor);
    let tail = &src[start..];
    decode_utf8(tail).and_then(|(ch, used)| {
        // The decoded sequence must consume everything up to the end of
        // `src`; otherwise trailing bytes were left undecoded.
        if used < tail.len() {
            None
        } else {
            Some((ch, used))
        }
    })
}
/// Reports whether `b` is anything other than a continuation byte.
///
/// Continuation bytes have the form `10xx_xxxx`, i.e. the range `0x80` to
/// `0xBF`; every byte outside that range yields `true`.
fn is_start_byte(b: u8) -> bool {
    b < 0x80 || b >= 0xC0
}
#[cfg(test)]
mod tests {
    use std::str;

    use quickcheck::quickcheck;

    use super::{
        TAG_CONT, TAG_TWO, TAG_THREE, TAG_FOUR,
        decode_utf8, decode_last_utf8, encode_utf8,
    };

    /// Encodes `cp` into a fresh four-byte scratch buffer and returns the
    /// buffer together with the encoded length. Panics if `encode_utf8`
    /// reports failure.
    fn encode(cp: char) -> ([u8; 4], usize) {
        let mut buf = [0; 4];
        let len = encode_utf8(cp, &mut buf).unwrap();
        (buf, len)
    }

    /// Encoding and then decoding from the front yields the same `char` and
    /// the same length.
    #[test]
    fn prop_roundtrip() {
        fn prop(input: char) -> bool {
            let (buf, len) = encode(input);
            let (decoded, used) = decode_utf8(&buf[..len]).unwrap();
            len == used && input == decoded
        }
        quickcheck(prop as fn(char) -> bool)
    }

    /// Encoding and then decoding from the back yields the same `char` and
    /// the same length.
    #[test]
    fn prop_roundtrip_last() {
        fn prop(input: char) -> bool {
            let (buf, len) = encode(input);
            let (decoded, used) = decode_last_utf8(&buf[..len]).unwrap();
            len == used && input == decoded
        }
        quickcheck(prop as fn(char) -> bool)
    }

    /// `encode_utf8` produces the same bytes as the standard library.
    #[test]
    fn prop_encode_matches_std() {
        fn prop(cp: char) -> bool {
            let (buf, len) = encode(cp);
            let expected = cp.to_string();
            &buf[..len] == expected.as_bytes()
        }
        quickcheck(prop as fn(char) -> bool)
    }

    /// `decode_utf8` agrees with the first `char` the standard library
    /// reads from the same bytes.
    #[test]
    fn prop_decode_matches_std() {
        fn prop(input: char) -> bool {
            let (buf, len) = encode(input);
            let (decoded, _) = decode_utf8(&buf[..len]).unwrap();
            let text = str::from_utf8(&buf[..len]).unwrap();
            decoded == text.chars().next().unwrap()
        }
        quickcheck(prop as fn(char) -> bool)
    }

    /// `decode_last_utf8` agrees with the last `char` the standard library
    /// reads from the same bytes.
    #[test]
    fn prop_decode_last_matches_std() {
        fn prop(input: char) -> bool {
            let (buf, len) = encode(input);
            let (decoded, _) = decode_last_utf8(&buf[..len]).unwrap();
            let text = str::from_utf8(&buf[..len]).unwrap();
            decoded == text.chars().next_back().unwrap()
        }
        quickcheck(prop as fn(char) -> bool)
    }

    #[test]
    fn reject_invalid() {
        // A byte that cannot lead any sequence.
        assert_eq!(decode_utf8(&[0xFF]), None);
        // Three bytes whose payload is a surrogate code point.
        assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
        // Lead bytes followed by too few bytes (2-, 3- and 4-byte forms).
        assert_eq!(decode_utf8(&[0xC3]), None);
        assert_eq!(decode_utf8(&[0xEF, 0xBF]), None);
        assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None);
        // Multi-byte forms whose payload is only the ASCII value of `a`.
        assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
        assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
        assert_eq!(
            decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a']),
            None
        );
    }

    #[test]
    fn reject_invalid_last() {
        // A byte that cannot lead any sequence.
        assert_eq!(decode_last_utf8(&[0xFF]), None);
        // Three bytes whose payload is a surrogate code point.
        assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
        // Lead bytes followed by too few bytes (2-, 3- and 4-byte forms).
        assert_eq!(decode_last_utf8(&[0xC3]), None);
        assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None);
        assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None);
        // Multi-byte forms whose payload is only the ASCII value of `a`.
        assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
        assert_eq!(
            decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']),
            None
        );
        assert_eq!(
            decode_last_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a']),
            None
        );
    }
}