#![doc = include_str!("../README.md")]
#![cfg_attr(not(any(test, feature = "std")), no_std)]
#![warn(missing_docs)]
#![forbid(unsafe_code)]
#![allow(clippy::unusual_byte_groupings)]
mod error;
pub use error::Utf8ParserError;
/// Smallest code point that genuinely needs a two-byte encoding: a single byte carries
/// 7 payload bits, so everything below `1 << 7` fits in one byte.
const FIRST_CODE_POINT_FOR_DOUBLE: u32 = 1 << 7;
/// Smallest code point that genuinely needs a three-byte encoding: a two-byte sequence
/// carries 5 + 6 = 11 payload bits.
const FIRST_CODE_POINT_FOR_TRIPLE: u32 = 1 << 11;
/// Smallest code point that genuinely needs a four-byte encoding: a three-byte sequence
/// carries 4 + 6 + 6 = 16 payload bits.
const FIRST_CODE_POINT_FOR_QUADRUPLE: u32 = 1 << 16;
/// Classification of a single byte according to its UTF-8 prefix bits.
///
/// Use [`Utf8ByteType::of`] to classify a byte.
#[derive(Copy, Clone, PartialEq, Eq, Debug, Hash)]
pub enum Utf8ByteType {
/// Continuation byte: prefix bits `10`, followed by six payload bits.
Continuation,
/// Complete one-byte character: prefix bit `0`, followed by seven payload bits.
Single,
/// First byte of a two-byte sequence: prefix bits `110`, followed by five payload bits.
Double,
/// First byte of a three-byte sequence: prefix bits `1110`, followed by four payload bits.
Triple,
/// First byte of a four-byte sequence: prefix bits `11110`, followed by three payload bits.
Quadruple,
}
impl Utf8ByteType {
pub fn of(byte: u8) -> Result<Self, Utf8ParserError> {
use Utf8ByteType::*;
let kinds = [Continuation, Single, Double, Triple, Quadruple];
for kind in kinds {
if kind.matches(byte) {
return Ok(kind);
}
}
Err(Utf8ParserError::InvalidByte(byte))
}
pub const fn is_continuation(self) -> bool {
matches!(self, Self::Continuation)
}
const fn id(self) -> u8 {
match self {
Self::Single => 0b0,
Self::Continuation => 0b10,
Self::Double => 0b110,
Self::Triple => 0b1110,
Self::Quadruple => 0b11110,
}
}
const fn id_length(self) -> u32 {
self.id().count_ones() + 1
}
const fn value_mask(self) -> u8 {
0xFF >> self.id_length()
}
const fn value_mask_length(self) -> u32 {
self.value_mask().count_ones()
}
const fn matches(self, byte: u8) -> bool {
(byte >> self.value_mask_length()) == self.id()
}
}
// A byte split into its kind and its payload. The wrapped `u8` is the input byte with the
// prefix bits masked off (see `ParsedByte::from_byte`).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum ParsedByte {
// Complete one-byte character; payload is the low seven bits.
Single(u8),
// First byte of a two-byte sequence; payload is the low five bits.
StartDouble(u8),
// First byte of a three-byte sequence; payload is the low four bits.
StartTriple(u8),
// First byte of a four-byte sequence; payload is the low three bits.
StartQuadruple(u8),
// Continuation byte; payload is the low six bits.
ContinuationByte(u8),
}
impl ParsedByte {
fn from_byte(byte: u8) -> Result<Self, Utf8ParserError> {
use Utf8ByteType::*;
let kind = Utf8ByteType::of(byte)?;
let value = byte & kind.value_mask();
Ok(match kind {
Continuation => Self::ContinuationByte(value),
Single => Self::Single(value),
Double => Self::StartDouble(value),
Triple => Self::StartTriple(value),
Quadruple => Self::StartQuadruple(value),
})
}
}
// Progress of the decoder between two pushed bytes. The `u32` in the non-fresh variants is
// the payload accumulated from the bytes of the current sequence seen so far.
#[derive(Copy, Clone, Debug)]
enum State {
// No sequence in progress; the next byte must be a single byte or a start byte.
Fresh,
// One more continuation byte completes the character.
OneLeft(u32),
// Two more continuation bytes are required.
TwoLeft(u32),
// Three more continuation bytes are required.
ThreeLeft(u32),
}
// Appends the payload of one continuation byte to the low end of `current`.
//
// The two checks are debug-only: `current` must leave room for the shift and `byte` must
// already have its prefix bits masked off.
const fn push_byte(current: u32, byte: u8) -> u32 {
    debug_assert!(current <= 0x00FF_FFFF);
    debug_assert!(byte <= 0b0011_1111);
    let shift = Utf8ByteType::Continuation.value_mask_length();
    let payload = byte as u32;
    (current << shift) | payload
}
/// Incremental UTF-8 decoder that is fed one byte at a time through [`Utf8Parser::push`].
#[derive(Clone, Debug)]
pub struct Utf8Parser {
// Where the decoder stands within the current (possibly multi-byte) character.
state: State,
}
impl Utf8Parser {
pub const fn new() -> Self {
Self {
state: State::Fresh,
}
}
pub fn push(&mut self, byte: u8) -> Result<Option<char>, Utf8ParserError> {
match self.push_inner_impl(byte) {
Ok(val) => Ok(val),
Err(val) => {
self.reset();
Err(val)
}
}
}
fn push_inner_impl(&mut self, byte: u8) -> Result<Option<char>, Utf8ParserError> {
let byte = ParsedByte::from_byte(byte)?;
match (self.state, byte) {
(State::OneLeft(current), ParsedByte::ContinuationByte(value)) => {
self.state = State::Fresh;
let val = push_byte(current, value);
if val < FIRST_CODE_POINT_FOR_DOUBLE {
return Err(Utf8ParserError::OverlongEncoding);
}
Ok(Some(
char::try_from(val).map_err(|_| Utf8ParserError::InvalidChar(val))?,
))
}
(State::TwoLeft(current), ParsedByte::ContinuationByte(value)) => {
let val = push_byte(current, value);
if val << Utf8ByteType::Continuation.value_mask_length()
< FIRST_CODE_POINT_FOR_TRIPLE
{
return Err(Utf8ParserError::OverlongEncoding);
}
self.state = State::OneLeft(val);
Ok(None)
}
(State::ThreeLeft(current), ParsedByte::ContinuationByte(value)) => {
let val = push_byte(current, value);
if val << (2 * Utf8ByteType::Continuation.value_mask_length())
< FIRST_CODE_POINT_FOR_QUADRUPLE
{
return Err(Utf8ParserError::OverlongEncoding);
}
self.state = State::TwoLeft(val);
Ok(None)
}
(State::Fresh, ParsedByte::Single(value)) => Ok(Some(value as char)),
(State::Fresh, ParsedByte::StartDouble(value)) => {
self.state = State::OneLeft(value as u32);
Ok(None)
}
(State::Fresh, ParsedByte::StartTriple(value)) => {
self.state = State::TwoLeft(value as u32);
Ok(None)
}
(State::Fresh, ParsedByte::StartQuadruple(value)) => {
self.state = State::ThreeLeft(value as u32);
Ok(None)
}
(
State::OneLeft(_) | State::TwoLeft(_) | State::ThreeLeft(_),
ParsedByte::Single(value)
| ParsedByte::StartDouble(value)
| ParsedByte::StartTriple(value)
| ParsedByte::StartQuadruple(value),
) => Err(Utf8ParserError::UnexpectedStartByte(value)),
(State::Fresh, ParsedByte::ContinuationByte(value)) => {
Err(Utf8ParserError::UnexpectedContinuationByte(value))
}
}
}
fn reset(&mut self) {
self.state = State::Fresh;
}
}
impl Default for Utf8Parser {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use rand::Rng;

    // Every kind of byte is split into the right variant and payload.
    #[test]
    fn conversion() -> Result<(), Utf8ParserError> {
        let test_vectors: &[(u8, ParsedByte)] = &[
            (0x00, ParsedByte::Single(0x00)),
            (0x01, ParsedByte::Single(0x01)),
            (0x65, ParsedByte::Single(0x65)),
            (0x7f, ParsedByte::Single(0x7f)),
            (0b110_00000, ParsedByte::StartDouble(0)),
            (0b110_00001, ParsedByte::StartDouble(0b1)),
            (0b110_11001, ParsedByte::StartDouble(0b11001)),
            (0b110_11111, ParsedByte::StartDouble(0b11111)),
            (0b1110_0000, ParsedByte::StartTriple(0)),
            (0b1110_0001, ParsedByte::StartTriple(0b1)),
            (0b1110_1001, ParsedByte::StartTriple(0b1001)),
            (0b1110_1111, ParsedByte::StartTriple(0b1111)),
            (0b1111_0000, ParsedByte::StartQuadruple(0)),
            (0b1111_0001, ParsedByte::StartQuadruple(0b1)),
            (0b1111_0111, ParsedByte::StartQuadruple(0b111)),
            (0x80, ParsedByte::ContinuationByte(0x00)),
            (0x81, ParsedByte::ContinuationByte(0x01)),
            (0b10_111111, ParsedByte::ContinuationByte(0b111111)),
        ];
        for &(raw, expected) in test_vectors {
            assert_eq!(ParsedByte::from_byte(raw)?, expected);
        }
        Ok(())
    }

    // ASCII bytes come straight back; a start byte alone yields nothing yet.
    #[test]
    fn basic() -> Result<(), Utf8ParserError> {
        let mut parser = Utf8Parser::default();
        for &ascii in b"hello" {
            assert_eq!(parser.push(ascii)?, Some(char::from(ascii)));
        }
        assert_eq!(parser.push(0b1101_0000)?, None);
        Ok(())
    }

    // Feeds `original` to a fresh parser byte by byte and collects the emitted characters.
    // When no byte is rejected, also checks the result against `String::from_utf8`, which
    // panics here if `original` is not valid UTF-8.
    fn parse_str_by_bytes(original: &[u8]) -> Result<String, Utf8ParserError> {
        let mut parser = Utf8Parser::default();
        let mut rebuilt = String::new();
        for &byte in original {
            rebuilt.extend(parser.push(byte)?);
        }
        assert_eq!(String::from_utf8(original.to_vec()).unwrap(), rebuilt);
        Ok(rebuilt)
    }

    #[test]
    fn parse_ascii_stream() -> Result<(), Utf8ParserError> {
        parse_str_by_bytes("The quick brown fox jamped over the lazy dog".as_bytes()).map(|_| ())
    }

    #[test]
    fn parse_emoji_stream() -> Result<(), Utf8ParserError> {
        parse_str_by_bytes("Thé quick brown 🦊 jamped over the lazy 🐕".as_bytes()).map(|_| ())
    }

    // An error in the middle of a sequence must not poison the following byte.
    #[test]
    fn reset_state_after_error() {
        let mut parser = Utf8Parser::default();
        let started = parser.push(0b1110_0000);
        assert!(started.is_ok());
        let rejected = parser.push(0b1111_1110);
        assert!(rejected.is_err());
        assert_eq!(parser.push(b'a'), Ok(Some('a')));
    }

    #[test]
    fn error_on_overlong_encodings() {
        // Shortest-form encodings at the edges of each length class.
        let good: Vec<(&[u8], u32)> = vec![
            (&[0b0_0000000], 0x00),
            (&[0b0_1111111], 0x7f),
            (&[0b110_00010, 0b10_000000], 0x80),
            (&[0b110_11111, 0b10_111111], 0x7ff),
            (&[0b1110_0000, 0b10_100000, 0b10_000000], 0x800),
            (&[0b1110_1111, 0b10_111111, 0b10_111111], 0xFFFF),
            (
                &[0b11110_000, 0b10_010000, 0b10_000000, 0b10_000000],
                0x10000,
            ),
            (
                &[0b11110_100, 0b10_001111, 0b10_111111, 0b10_111111],
                0x10FFFF,
            ),
        ];
        // Sequences that encode a value a shorter sequence could have carried.
        let overlong: Vec<&[u8]> = vec![
            &[0b110_00000, 0b10_000000],
            &[0b110_00001, 0b10_111111],
            &[0b1110_0000, 0b10_000000, 0b10_000000],
            &[0b1110_0000, 0b10_011111, 0b10_111111],
            &[0b11110_000, 0b10_000000, 0b10_000000, 0b10_000000],
            &[0b11110_000, 0b10_001111, 0b10_000000, 0b10_111111],
        ];
        // Rejected, but for a reason other than being overlong.
        let err_but_not_overlong: Vec<&[u8]> = vec![
            &[0b11110_110, 0b10_000000, 0b10_000000, 0b10_000000],
        ];

        for (bytes, code_point) in good {
            let decoded = parse_str_by_bytes(bytes).unwrap();
            let first = decoded.chars().next().unwrap();
            assert_eq!(first as u32, code_point);
        }
        for bytes in overlong {
            let error = parse_str_by_bytes(bytes).unwrap_err();
            assert_eq!(error, Utf8ParserError::OverlongEncoding);
        }
        for bytes in err_but_not_overlong {
            let error = parse_str_by_bytes(bytes).unwrap_err();
            assert_ne!(error, Utf8ParserError::OverlongEncoding);
        }
    }

    // Arbitrary bytes may be rejected but must never panic.
    #[test]
    fn random_input_dont_panic() {
        let mut rng = rand::thread_rng();
        let mut parser = Utf8Parser::default();
        for _ in 0..1_000_000 {
            let byte: u8 = rng.gen();
            let _ = parser.push(byte);
        }
    }

    // Bytes below 0x80 are always accepted.
    #[test]
    fn random_ascii_dont_error() {
        let mut rng = rand::thread_rng();
        let mut parser = Utf8Parser::default();
        for _ in 0..1_000_000 {
            let val: u8 = rng.gen();
            let ascii = val % 0x80;
            parser.push(ascii).unwrap();
        }
    }
}