use std::char::from_u32;
use std::ops::RangeInclusive;
use thiserror::Error;
/// Smallest code point of the UTF-16 surrogate block (U+D800); lower bound
/// used by the decoder's surrogate check.
const LOW_UTF_16_SURROGATE: u32 = 0xD800;
/// Largest code point of the UTF-16 surrogate block (U+DFFF); upper bound
/// used by the decoder's surrogate check.
const HIGH_UTF_16_SURROGATE: u32 = 0xDFFF;
/// One-slot look-ahead buffer for the byte stream.
enum CachedValue {
    /// Nothing is buffered; the next byte comes from the wrapped iterator.
    None,
    /// A byte that was read but not consumed; it is handed out first.
    Byte(u8),
    /// The wrapped iterator reported the end of the input.
    Eof,
}

/// Adapter that turns an iterator of `Result<u8, std::io::Error>` into an
/// iterator of decoded characters.
///
/// Besides the wrapped iterator it keeps a one-byte look-ahead cache and a
/// one-character push-back slot (see [`Utf8Iterator::unget`]).
pub struct Utf8Iterator<R>
where
    R: Iterator,
{
    inner: R,
    _cache: CachedValue,
    _unget: Option<char>,
}

impl<R> Utf8Iterator<R>
where
    R: Iterator<Item = Result<u8, std::io::Error>>,
{
    /// Wraps `inner`, starting with an empty byte cache and an empty
    /// push-back slot.
    pub fn new(inner: R) -> Self {
        Self {
            inner: inner,
            _cache: CachedValue::None,
            _unget: None,
        }
    }

    /// Pushes `ch` back so that it is the next character handed out.
    ///
    /// # Panics
    ///
    /// Panics when a previously pushed-back character has not been consumed
    /// yet: the slot holds a single character.
    pub fn unget(&mut self, ch: char) {
        if self._unget.is_some() {
            panic!("Cannot return character before consuming the previous cached value.")
        }
        self._unget = Some(ch);
    }

    /// Empties the push-back slot and returns whatever it contained.
    fn take_unget(&mut self) -> Option<char> {
        std::mem::replace(&mut self._unget, None)
    }

    /// Returns the next raw item, serving the look-ahead cache first.
    ///
    /// The cache is always left empty afterwards. A cached `Eof` yields `None`
    /// without polling the wrapped iterator. When polling, errors of kind
    /// `Interrupted` are skipped and the read is retried; every other item is
    /// returned unchanged.
    fn uncache_or_next(&mut self) -> Option<R::Item> {
        match std::mem::replace(self.cache_mut(), CachedValue::None) {
            CachedValue::Byte(byte) => Some(Ok(byte)),
            CachedValue::Eof => None,
            CachedValue::None => loop {
                match self.inner.next() {
                    Some(Err(ref e)) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                    other => break other,
                }
            },
        }
    }

    /// Shared view of the look-ahead cache.
    fn cache(&self) -> &CachedValue {
        &self._cache
    }

    /// Mutable view of the look-ahead cache.
    fn cache_mut(&mut self) -> &mut CachedValue {
        &mut self._cache
    }
}
#[derive(Error, Debug)]
pub enum Utf8IteratorError {
#[error("IO Error from the wrapped iterator while decoding sequence: {0:?}")]
IoError(std::io::Error, Box<[u8]>),
#[error("The decoder found a malformed sequence while decoding sequence: {0:?}")]
InvalidSequenceError(Box<[u8]>),
#[error("The sequence is well formed, but it is too long (more than 4 bytes). Sequence: {0:?}")]
LongSequenceError(Box<[u8]>),
#[error("Found a well formed UTF-8 sequence, nevertheless the value does not represent a valid character. Sequence: {0:?}")]
InvalidCharError(Box<[u8]>),
}
use crate::Utf8IteratorError::*;
impl<R> Iterator for Utf8Iterator<R>
where
    R: Iterator<Item = Result<u8, std::io::Error>>,
{
    type Item = Result<char, Utf8IteratorError>;

    /// Decodes and returns the next character.
    ///
    /// A character pushed back with `unget` is returned first. Otherwise one
    /// sequence is read from the byte source and the result is:
    ///
    /// * `None` once the byte source is exhausted;
    /// * `Some(Ok(ch))` for an accepted sequence;
    /// * `Some(Err(IoError))` when the byte source fails (together with the
    ///   bytes of the sequence read so far);
    /// * `Some(Err(InvalidSequenceError))` for an invalid first byte, or when
    ///   the sequence is cut short by a non-continuation byte or by the end of
    ///   the input. The offending byte (or the end-of-input condition) is kept
    ///   in the look-ahead cache, so decoding resumes right after the error;
    /// * `Some(Err(LongSequenceError))` for complete 5- and 6-byte sequences;
    /// * `Some(Err(InvalidCharError))` for overlong encodings, surrogates,
    ///   values above U+10FFFF and the values U+FFFE and U+FFFF.
    fn next(&mut self) -> Option<Self::Item> {
        /// Classifies the first byte of a sequence.
        ///
        /// Returns the total sequence length (0 when the byte cannot start a
        /// sequence), the payload bits carried by the first byte, and the
        /// range of values that need exactly that many bytes. Anything decoded
        /// outside that range is overlong or too large.
        fn length_first_bits_and_valid_range(first_byte: u8) -> (usize, u32, RangeInclusive<u32>) {
            // Number of leading 1 bits: 0 = ASCII, 1 = continuation byte,
            // 2..=6 = length of the sequence, 7 and 8 = 0xFE / 0xFF.
            let leading_ones = (!first_byte).leading_zeros();
            let range = match leading_ones {
                0 => return (1, u32::from(first_byte), 0x00..=0x7f),
                2 => 0x0080..=0x07ff,
                3 => 0x0800..=0xffff,
                4 => 0x1_0000..=0x10_ffff,
                // Always reported as too long; the range is never consulted.
                5 | 6 => 0..=0,
                _ => return (0, 0, 0..=0),
            };
            // Drop the length marker and the 0 bit that terminates it.
            let payload = first_byte & (0xff_u8 >> (leading_ones + 1));
            // `leading_ones` is at most 6 here, so the cast is lossless.
            (leading_ones as usize, u32::from(payload), range)
        }

        /// Values that are rejected even when correctly encoded: the UTF-16
        /// surrogate block (bounds included) and U+FFFE / U+FFFF.
        fn is_rejected_value(value: u32) -> bool {
            (LOW_UTF_16_SURROGATE..=HIGH_UTF_16_SURROGATE).contains(&value)
                || value == 0xfffe
                || value == 0xffff
        }

        if let Some(ch) = self.take_unget() {
            return Some(Ok(ch));
        }

        let first_byte = match self.uncache_or_next() {
            Some(Ok(byte)) => byte,
            Some(Err(e)) => return Some(Err(IoError(e, Vec::<u8>::new().into_boxed_slice()))),
            None => {
                // Remember the end of the input so the source is not polled again.
                *self.cache_mut() = CachedValue::Eof;
                return None;
            }
        };

        let (nbytes, mut builder, range) = length_first_bits_and_valid_range(first_byte);
        let mut seq: Vec<u8> = Vec::with_capacity(nbytes.max(1));
        seq.push(first_byte);
        if nbytes == 0 {
            // Stray continuation byte, or 0xFE / 0xFF.
            return Some(Err(InvalidSequenceError(seq.into_boxed_slice())));
        }

        while seq.len() < nbytes {
            match self.uncache_or_next() {
                Some(Ok(next_byte)) if next_byte & 0xC0u8 == 0x80u8 => {
                    seq.push(next_byte);
                    builder = (builder << 6) | u32::from(next_byte & 0x3Fu8);
                }
                Some(Ok(next_byte)) => {
                    // Not a continuation byte: it starts the next sequence.
                    *self.cache_mut() = CachedValue::Byte(next_byte);
                    return Some(Err(InvalidSequenceError(seq.into_boxed_slice())));
                }
                Some(Err(e)) => return Some(Err(IoError(e, seq.into_boxed_slice()))),
                None => {
                    *self.cache_mut() = CachedValue::Eof;
                    return Some(Err(InvalidSequenceError(seq.into_boxed_slice())));
                }
            }
        }

        if nbytes >= 5 {
            return Some(Err(LongSequenceError(seq.into_boxed_slice())));
        }

        let decoded = if range.contains(&builder) && !is_rejected_value(builder) {
            from_u32(builder)
        } else {
            None
        };
        Some(match decoded {
            Some(ch) => Ok(ch),
            None => Err(InvalidCharError(seq.into_boxed_slice())),
        })
    }

    /// Bounds on the number of items still to come.
    ///
    /// The lower bound counts only what is already buffered (a pushed-back
    /// character and/or a cached byte). The byte source's own lower bound
    /// cannot be forwarded, because one item may consume up to six bytes and
    /// `Interrupted` errors produce no item. The upper bound is the source's
    /// upper bound plus the buffered items, since every item consumes at least
    /// one of them. Once the end of the input has been seen, only a
    /// pushed-back character can remain.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let pending_char = if self._unget.is_some() { 1 } else { 0 };
        let pending_byte = match self.cache() {
            CachedValue::Eof => return (pending_char, Some(pending_char)),
            CachedValue::Byte(_) => 1,
            CachedValue::None => 0,
        };
        let buffered = pending_char + pending_byte;
        let (_, inner_upper) = self.inner.size_hint();
        (
            buffered,
            inner_upper.and_then(|upper| upper.checked_add(buffered)),
        )
    }
}
#[cfg(test)]
mod tests {
use super::*;
use core::fmt::Debug;
use std::io::prelude::*;
use std::io::BufReader;
use std::io::Cursor;
use tempfile::tempfile;
// Test helper: decodes the given bytes with a fresh `Utf8Iterator` and asserts
// that the result is exactly one character equal to `$ch`, followed by the end
// of the input.
macro_rules! match_char_and_sequence {
($ch:expr; $($x:expr),*) => {
let input: Vec<u8> = vec![ $($x),* ];
let mut chiter = Utf8Iterator::new(Cursor::new(input).bytes());
assert_eq!($ch, chiter.next().unwrap().unwrap());
assert!(chiter.next().is_none());
};
}
// Test helper: asserts that decoding yields the error variant `$err` carrying
// the expected bytes. Three forms are accepted:
//   * `$err; bytes...`            - fresh iterator over `bytes`; the error must
//                                   carry the same bytes and be the only item;
//   * `$chiter; $err; bytes...`   - pulls one item from an existing iterator and
//                                   does not check for the end of the input;
//   * `$err; input...; bytes...`  - fresh iterator over `input`; the error must
//                                   carry `bytes` and be the only item.
macro_rules! match_err_and_sequence {
($err:ident; $($x:expr),*) => {
let input: Vec<u8> = vec![ $($x),* ];
let mut chiter = Utf8Iterator::new(Cursor::new(input).bytes());
let value = chiter.next().unwrap();
if let Err($err(bytes)) = value {
assert_eq!(vec![ $($x),* ].into_boxed_slice(), bytes)
} else {
panic!("Expecting:{:?}, found: {:?}", stringify!(Err(Utf8IteratorError { $err: [$($x),*]})), value);
}
assert!(chiter.next().is_none());
};
($chiter:ident; $err:ident; $($x:expr),*) => {
let value = $chiter.next().unwrap();
if let Err($err(bytes)) = value {
assert_eq!(vec![ $($x),* ].into_boxed_slice(), bytes)
} else {
panic!("Expecting:{:?}, found: {:?}", stringify!(Err(Utf8IteratorError { $err: [$($x),*]})), value);
}
};
($err:ident; $($x:expr),*; $($y:expr),*) => {
let input: Vec<u8> = vec![ $($x),* ];
let mut chiter = Utf8Iterator::new(Cursor::new(input).bytes());
let value = chiter.next().unwrap();
if let Err($err(bytes)) = value {
assert_eq!(vec![ $($y),* ].into_boxed_slice(), bytes)
} else {
panic!("Expecting:{:?}, found: {:?}", stringify!(Err(Utf8IteratorError { $err: [$($y),*]})), value);
}
assert!(chiter.next().is_none());
};
}
// Test helper: pulls one item from `$chiter` and asserts that it is an
// `InvalidSequenceError` carrying exactly the bytes `$seq...`.
macro_rules! match_incomplete {
    ($chiter:ident; $($seq:expr),*) => {
        let value = $chiter.next().unwrap();
        if let Err(InvalidSequenceError(bytes)) = value {
            assert_eq!(vec![ $($seq),* ].into_boxed_slice(), bytes)
        } else {
            // `panic!` needs a format string: a bare non-string payload is
            // rejected by the 2021 edition and linted against before it.
            panic!("Expecting an InvalidSequenceError, found: {:?}", value);
        }
    };
}
#[test]
fn _1_some_correct_utf_8_text() {
    // The five characters are written as escapes so the expected code points
    // are unambiguous; in particular the second one is U+1F79, which is what
    // the bytes e1 bd b9 decode to.
    let expected = ['\u{3ba}', '\u{1f79}', '\u{3c3}', '\u{3bc}', '\u{3b5}'];
    let encoded: Vec<u8> = vec![
        0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5,
    ];
    let mut chiter = Utf8Iterator::new(Cursor::new(encoded).bytes());
    for want in expected.iter() {
        assert_eq!(*want, chiter.next().unwrap().unwrap());
    }
    assert!(chiter.next().is_none());
}
// Smallest value that needs 2, 3 and 4 bytes, followed by the smallest 5- and
// 6-byte forms, which must be reported as too long.
#[test]
fn _2_1_first_possible_sequence_of_a_certain_length() {
match_char_and_sequence!['\u{80}'; 0xc2, 0x80 ];
match_char_and_sequence!['\u{800}'; 0xe0, 0xa0, 0x80 ];
match_char_and_sequence!['\u{10000}'; 0xf0, 0x90, 0x80, 0x80 ];
match_err_and_sequence![LongSequenceError; 0b11111000, 0b10000000, 0b10000000, 0b10000000, 0b10000000];
match_err_and_sequence![LongSequenceError; 0b11111100, 0b10000000, 0b10000000, 0b10000000, 0b10000000, 0b10000000 ];
}
// Largest value of each sequence length.
#[test]
fn _2_2_last_possible_sequence_of_a_certain_length() {
match_char_and_sequence!['\u{7f}'; 0b_0111_1111];
match_char_and_sequence!['\u{7FF}'; 0b_1101_1111, 0b_1011_1111];
// f7 bf bf bf carries the value 0x1FFFFF, which is above U+10FFFF.
match_err_and_sequence![InvalidCharError; 0b_1111_0111, 0b1011_1111, 0b1011_1111, 0b1011_1111];
match_char_and_sequence!['\u{10FFFF}'; 0b_1111_0100, 0b1000_1111, 0b1011_1111, 0b1011_1111];
match_err_and_sequence![LongSequenceError; 0b_1111_1011, 0b1011_1111, 0b1011_1111, 0b1011_1111, 0b1011_1111];
match_err_and_sequence![LongSequenceError; 0b_1111_1101, 0b1011_1111, 0b1011_1111, 0b1011_1111, 0b1011_1111, 0b1011_1111 ];
}
// Values right around the surrogate block and around the top of the code
// space.
#[test]
fn _2_3_other_boundary_conditions() {
match_char_and_sequence!['\u{00D7FF}'; 0xed, 0x9f, 0xbf ];
match_char_and_sequence!['\u{00E000}'; 0xee, 0x80, 0x80 ];
match_char_and_sequence!['\u{00FFFD}'; 0xef, 0xbf, 0xbd ];
match_char_and_sequence!['\u{10FFFF}'; 0xf4, 0x8f, 0xbf, 0xbf ];
// f4 90 80 80 carries the value 0x110000, one past the last valid code point.
match_err_and_sequence![InvalidCharError; 0xf4u8, 0x90u8, 0x80u8, 0x80u8 ];
}
#[test]
fn _3_1_unexpected_continuation_bytes() {
    /// Decodes `bytes` and requires every single byte to be reported on its
    /// own as an `InvalidSequenceError`, with nothing left afterwards.
    ///
    /// Any other outcome fails the test; a conditional `if let` check would
    /// silently skip it.
    fn assert_each_byte_is_rejected(bytes: Vec<u8>) {
        let expected = bytes.clone();
        let mut chiter = Utf8Iterator::new(BufReader::new(Cursor::new(bytes)).bytes());
        for byte in expected {
            match chiter.next() {
                Some(Err(InvalidSequenceError(seq))) => assert_eq!(&[byte][..], &seq[..]),
                other => panic!(
                    "Expecting InvalidSequenceError([{:#04x}]), found: {:?}",
                    byte, other
                ),
            }
        }
        assert!(chiter.next().is_none());
    }

    match_err_and_sequence![InvalidSequenceError; 0x80 ];
    match_err_and_sequence![InvalidSequenceError; 0xbf ];
    match_char_and_sequence!['\u{80}'; 0b110_0_0010, 0b10_00_0000 ];
    assert_eq!('\u{80}', from_u32(0x80).unwrap());

    // A handful of continuation bytes in a row.
    assert_each_byte_is_rejected(vec![0x80, 0xbf, 0x81, 0xb0, 0x80, 0xbf]);

    // All 64 possible continuation bytes (0x80..=0xbf) in a row.
    assert_each_byte_is_rejected((0u8..64u8).map(|i| i | 0b_1000_0000u8).collect());
}
#[test]
fn _3_2_lonely_start_characters() {
    /// Builds an input where every byte of `first_bytes` is followed by a
    /// space and requires the decoder to report each first byte alone as an
    /// `InvalidSequenceError`, then to decode the space normally.
    ///
    /// Any other outcome fails the test; a conditional `if let` check would
    /// silently skip it.
    fn assert_lonely_starts(first_bytes: std::ops::RangeInclusive<u8>) {
        let mut input: Vec<u8> = Vec::new();
        for lead in first_bytes.clone() {
            input.push(lead);
            input.push(0x20);
        }
        let mut chiter = Utf8Iterator::new(BufReader::new(Cursor::new(input)).bytes());
        for lead in first_bytes {
            match chiter.next() {
                Some(Err(InvalidSequenceError(seq))) => assert_eq!(&[lead][..], &seq[..]),
                other => panic!(
                    "Expecting InvalidSequenceError([{:#04x}]), found: {:?}",
                    lead, other
                ),
            }
            match chiter.next() {
                Some(Ok(ch)) => assert_eq!(' ', ch),
                other => panic!("Expecting a space after {:#04x}, found: {:?}", lead, other),
            }
        }
        assert!(chiter.next().is_none());
    }

    // First bytes of 2-, 3-, 4-, 5- and 6-byte sequences respectively.
    assert_lonely_starts(0xC0u8..=0xdfu8);
    assert_lonely_starts(0xe0u8..=0xefu8);
    assert_lonely_starts(0xf0u8..=0xf7u8);
    assert_lonely_starts(0xf8u8..=0xfbu8);
    assert_lonely_starts(0xfcu8..=0xfdu8);
}
// Sequences of every length with their last byte missing: the input ends
// before the sequence is complete, so each one must be reported as an
// InvalidSequenceError carrying the bytes that were present.
#[test]
fn _3_3_sequences_with_last_continuation_byte_missing() {
// Lowest payload bits for each length.
match_err_and_sequence![InvalidSequenceError; 0b1100_0000 ];
match_err_and_sequence![InvalidSequenceError; 0b1110_0000, 0b1000_0000 ];
match_err_and_sequence![InvalidSequenceError; 0b1111_0000, 0b1000_0000 , 0b1000_0000 ];
match_err_and_sequence![InvalidSequenceError; 0b1111_1000, 0b1000_0000 , 0b1000_0000 , 0b1000_0000 ];
match_err_and_sequence![InvalidSequenceError; 0b1111_1100, 0b1000_0000 , 0b1000_0000 , 0b1000_0000 , 0b1000_0000 ];
// Other payload bits for each length.
match_err_and_sequence![InvalidSequenceError; 0b1100_1111 ];
match_err_and_sequence![InvalidSequenceError; 0b1110_0111, 0b1011_1111 ];
match_err_and_sequence![InvalidSequenceError; 0b1111_0111, 0b1011_1111 , 0b1011_1111 ];
match_err_and_sequence![InvalidSequenceError; 0b1111_1011, 0b1011_1111 , 0b1011_1111 , 0b1011_1111 ];
match_err_and_sequence![InvalidSequenceError; 0b1111_1101, 0b1011_1111 , 0b1011_1111 , 0b1011_1111 , 0b1011_1111 ];
}
#[test]
fn _3_4_concatenation_of_incomplete_sequences() {
    // Ten truncated sequences. The input is their concatenation, and the
    // decoder must report them one by one, each with exactly its own bytes.
    let truncated: [&[u8]; 10] = [
        &[0b1100_0000],
        &[0b1110_0000, 0b1000_0000],
        &[0b1111_0000, 0b1000_0000, 0b1000_0000],
        &[0b1111_1000, 0b1000_0000, 0b1000_0000, 0b1000_0000],
        &[0b1111_1100, 0b1000_0000, 0b1000_0000, 0b1000_0000, 0b1000_0000],
        &[0b1100_1111],
        &[0b1110_0111, 0b1011_1111],
        &[0b1111_0111, 0b1011_1111, 0b1011_1111],
        &[0b1111_1011, 0b1011_1111, 0b1011_1111, 0b1011_1111],
        &[0b1111_1101, 0b1011_1111, 0b1011_1111, 0b1011_1111, 0b1011_1111],
    ];
    let input: Vec<u8> = truncated
        .iter()
        .flat_map(|seq| seq.iter().cloned())
        .collect();
    let mut chiter = Utf8Iterator::new(Cursor::new(input).bytes());
    for expected in truncated.iter() {
        match chiter.next().unwrap() {
            Err(InvalidSequenceError(bytes)) => {
                assert_eq!(expected.to_vec().into_boxed_slice(), bytes)
            }
            other => panic!("{:?}", other),
        }
    }
    assert!(chiter.next().is_none());
}
// The value 0x2f encoded with more bytes than necessary. The 2-, 3- and 4-byte
// forms are rejected as invalid characters; the 5- and 6-byte forms are
// rejected as too long.
#[test]
fn _4_1_examples_of_an_overlong_ascii_character() {
match_err_and_sequence!(InvalidCharError; 0xc0, 0xaf);
match_err_and_sequence!(InvalidCharError; 0xe0, 0x80, 0xaf);
match_err_and_sequence!(InvalidCharError; 0xf0, 0x80, 0x80, 0xaf);
match_err_and_sequence!(LongSequenceError; 0xf8, 0x80, 0x80, 0x80, 0xaf);
match_err_and_sequence!(LongSequenceError; 0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf);
}
// For each sequence length, the largest value that would still fit in a
// shorter sequence (an overlong encoding).
#[test]
fn _4_2_maximum_overlong_sequences() {
match_err_and_sequence!(InvalidCharError; 0xc1, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xe0, 0x9f, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xf0, 0x8f, 0xbf, 0xbf);
match_err_and_sequence!(LongSequenceError; 0xf8, 0x87, 0xbf, 0xbf, 0xbf);
match_err_and_sequence!(LongSequenceError; 0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf);
}
// The value 0 encoded with 2 to 6 bytes; none of these forms may decode to
// the NUL character.
#[test]
fn _4_3_overlong_representation_of_the_nul_character() {
match_err_and_sequence!(InvalidCharError; 0xc0, 0x80);
match_err_and_sequence!(InvalidCharError; 0xe0, 0x80, 0x80);
match_err_and_sequence!(InvalidCharError; 0xf0, 0x80, 0x80, 0x80);
match_err_and_sequence!(LongSequenceError; 0xf8, 0x80, 0x80, 0x80, 0x80);
match_err_and_sequence!(LongSequenceError; 0xfc, 0x80, 0x80, 0x80, 0x80, 0x80);
}
// Well-formed 3-byte sequences whose value must be rejected: values in the
// UTF-16 surrogate block (all the `0xed, 0xa0..=0xbf, ..` sequences) and the
// values U+FFFE and U+FFFF. Every invocation decodes its sequence with a fresh
// iterator, so the repeated lines re-check the same sequences.
// NOTE(review): the repetition looks like paired surrogates split into single
// sequences - confirm against the test plan this was derived from.
#[test]
fn _5_illegal_code_positions() {
match_err_and_sequence!(InvalidCharError; 0xed, 0xa0, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xad, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xed, 0xae, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xaf, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xed, 0xb0, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xbe, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xbf, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xed, 0xa0, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xb0, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xa0, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xbf, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xed, 0xad, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xed, 0xb0, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xad, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xed, 0xbf, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xed, 0xae, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xb0, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xae, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xbf, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xed, 0xaf, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xed, 0xb0, 0x80);
match_err_and_sequence!(InvalidCharError; 0xed, 0xaf, 0xbf);
match_err_and_sequence!(InvalidCharError; 0xed, 0xbf, 0xbf);
// ef bf be and ef bf bf carry the values U+FFFE and U+FFFF.
match_err_and_sequence!(InvalidCharError; 0xef, 0xbf, 0xbe);
match_err_and_sequence!(InvalidCharError; 0xef, 0xbf, 0xbf);
}
#[test]
fn read_from_cursor() {
    // Mixed 1-, 2- and 3-byte characters. The decoder must hand back exactly
    // the characters of the string, in order, and then stop.
    let text = "来提供和改进网站体验ersé®þüúäåáßðfghjœøµñbv©xæ";
    assert_eq!(36, text.chars().count());
    let mut chiter = Utf8Iterator::new(Cursor::new(text).bytes());
    for expected in text.chars() {
        assert_eq!(expected, chiter.next().unwrap().unwrap());
    }
    assert!(chiter.next().is_none());
}
#[test]
fn read_from_file() {
    // Same text as the in-memory test, but round-tripped through a temporary
    // file: write it, rewind, then decode the file byte by byte.
    let text = "来提供和改进网站体验ersé®þüúäåáßðfghjœøµñbv©xæ";
    assert_eq!(36, text.chars().count());
    let mut file = tempfile().unwrap();
    file.write_all(text.as_bytes()).unwrap();
    file.flush().unwrap();
    file.seek(std::io::SeekFrom::Start(0)).unwrap();
    let mut chiter = Utf8Iterator::new(file.bytes());
    for expected in text.chars() {
        assert_eq!(expected, chiter.next().unwrap().unwrap());
    }
    assert!(chiter.next().is_none());
}
#[test]
fn read_file_with_errors() {
    // UTF-8 encoding of the five-character word used as the valid filler
    // between the broken sequences.
    const KOSME_BYTES: [u8; 11] = [
        0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5,
    ];

    // Valid text interleaved with: a truncated 4-byte sequence, two stray
    // continuation bytes, an overlong 4-byte sequence and an encoded surrogate.
    let mut input: Vec<u8> = Vec::new();
    input.extend_from_slice(&KOSME_BYTES);
    input.extend_from_slice(&[0b1111_0000, 0b1000_0000, 0b1000_0000]);
    input.extend_from_slice(&KOSME_BYTES);
    input.extend_from_slice(&[0x80, 0xbf]);
    input.extend_from_slice(&KOSME_BYTES);
    input.extend_from_slice(&[0xf0, 0x80, 0x80, 0xaf]);
    input.extend_from_slice(&[0xed, 0xa0, 0x80]);
    input.extend_from_slice(&KOSME_BYTES);

    let mut file = tempfile().unwrap();
    file.write_all(input.as_slice()).unwrap();
    file.flush().unwrap();
    file.seek(std::io::SeekFrom::Start(0)).unwrap();
    let mut chiter = Utf8Iterator::new(file.bytes());

    // Expects the five characters of the filler word from the iterator that is
    // passed in (the argument is actually used, so a wrong name cannot slip by).
    macro_rules! match_kosme {
        ($iter:ident) => {
            for expected in ['\u{3ba}', '\u{1f79}', '\u{3c3}', '\u{3bc}', '\u{3b5}'].iter() {
                assert_eq!(*expected, $iter.next().unwrap().unwrap());
            }
        };
    }

    match_kosme!(chiter);
    match_incomplete![ chiter; 0b1111_0000, 0b1000_0000, 0b1000_0000];
    match_kosme!(chiter);
    match_err_and_sequence!(chiter; InvalidSequenceError; 0x80);
    match_err_and_sequence!(chiter; InvalidSequenceError; 0xbf);
    match_kosme!(chiter);
    match_err_and_sequence!(chiter; InvalidCharError; 0xf0, 0x80, 0x80, 0xaf);
    match_err_and_sequence!(chiter; InvalidCharError; 0xed, 0xa0, 0x80);
    match_kosme!(chiter);
    assert!(chiter.next().is_none());
}
#[test]
fn unget() {
    // The five characters encoded by `encoded`, written as escapes so the
    // expected code points are unambiguous.
    let word = ['\u{3ba}', '\u{1f79}', '\u{3c3}', '\u{3bc}', '\u{3b5}'];
    let encoded: Vec<u8> = vec![
        0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5,
    ];
    let mut chiter = Utf8Iterator::new(Cursor::new(encoded).bytes());

    // A pushed-back character comes out before the rest of the stream.
    assert_eq!(word[0], chiter.next().unwrap().unwrap());
    chiter.unget(word[4]);
    assert_eq!(word[4], chiter.next().unwrap().unwrap());
    for expected in word[1..].iter() {
        assert_eq!(*expected, chiter.next().unwrap().unwrap());
    }

    // Pushing back still works once the underlying stream is exhausted.
    chiter.unget(word[0]);
    assert_eq!(word[0], chiter.next().unwrap().unwrap());
    assert!(chiter.next().is_none());
}
// Pushing back a second character before the first one is consumed must
// panic. The expected message is pinned so that an unrelated panic (for
// instance a failing setup assertion) cannot make this test pass.
#[test]
#[should_panic(expected = "Cannot return character before consuming the previous cached value.")]
fn unget_panic() {
    let input: Vec<u8> = vec![
        0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5,
    ];
    let mut chiter = Utf8Iterator::new(Cursor::new(input).bytes());
    assert_eq!('\u{3ba}', chiter.next().unwrap().unwrap());
    chiter.unget('\u{3b5}');
    chiter.unget('\u{3ba}');
}
// Usage example: a tiny Lisp-like tokenizer built on top of `Utf8Iterator`,
// relying on `unget` to hand a delimiter back to the stream once it has
// terminated an identifier or an integer.
#[test]
fn tokenizer() {
    use std::io::Bytes;

    #[derive(Debug, Clone, PartialEq)]
    enum Token {
        None,
        Identifier(String),
        Integer(String),
        OpenList,
        CloseList,
        Symbol(String),
        Invalid(String),
    }

    enum State {
        Begin,
        DecodingIdentifier,
        DecodingInteger,
        FinishedToken,
        Invalid,
    }

    /// Computes the next `(state, token)` pair from the current pair and the
    /// character `ch`. A delimiter that ends an identifier or an integer is
    /// pushed back to `chiter` so that it starts the following token.
    fn state_machine(
        chiter: &mut Utf8Iterator<Bytes<Cursor<&str>>>,
        ch: char,
        state: &(State, Token),
    ) -> (State, Token) {
        match state {
            (State::Invalid, _) | (State::FinishedToken, _) | (State::Begin, _) => {
                if ch == '(' {
                    (State::FinishedToken, Token::OpenList)
                } else if ch == ')' {
                    (State::FinishedToken, Token::CloseList)
                } else if ch.is_whitespace() {
                    (State::Begin, Token::None)
                } else if ch.is_alphabetic() || ch == '_' {
                    (State::DecodingIdentifier, Token::Identifier(ch.to_string()))
                } else if ch.is_digit(10) {
                    // Same digit test as the one used to continue an integer.
                    (State::DecodingInteger, Token::Integer(ch.to_string()))
                } else if ch.is_ascii_punctuation() {
                    (State::FinishedToken, Token::Symbol(ch.to_string()))
                } else {
                    (State::Invalid, Token::Invalid(ch.to_string()))
                }
            }
            (State::DecodingIdentifier, Token::Identifier(id)) => {
                if ch.is_whitespace() {
                    (State::FinishedToken, Token::Identifier(id.clone()))
                } else if ch.is_alphanumeric() || ch == '_' {
                    let mut longer = id.clone();
                    longer.push(ch);
                    (State::DecodingIdentifier, Token::Identifier(longer))
                } else {
                    chiter.unget(ch);
                    (State::FinishedToken, Token::Identifier(id.clone()))
                }
            }
            (State::DecodingInteger, Token::Integer(num)) => {
                if ch.is_whitespace() {
                    (State::FinishedToken, Token::Integer(num.clone()))
                } else if ch.is_digit(10) {
                    // Stay in the integer state; switching to the identifier
                    // state here would pair it with an `Integer` token, which
                    // no arm accepts.
                    let mut longer = num.clone();
                    longer.push(ch);
                    (State::DecodingInteger, Token::Integer(longer))
                } else {
                    // The delimiter goes back to the stream and is NOT part of
                    // the number.
                    chiter.unget(ch);
                    (State::FinishedToken, Token::Integer(num.clone()))
                }
            }
            (_, _) => panic!("Inconsistent state!"),
        }
    }

    /// Feeds characters to the state machine until a token is finished.
    /// Returns `None` when the input ends; any decoding error panics.
    fn next_token(
        chiter: &mut Utf8Iterator<Bytes<Cursor<&str>>>,
        state: &mut (State, Token),
    ) -> Option<Token> {
        loop {
            let ch = match chiter.next()? {
                Ok(ch) => ch,
                Err(InvalidSequenceError(bytes)) => {
                    panic!("Detected an invalid UTF-8 sequence! {:?}", bytes)
                }
                Err(LongSequenceError(bytes)) => {
                    panic!("UTF-8 sequence with more than 4 bytes! {:?}", bytes)
                }
                Err(InvalidCharError(bytes)) => panic!(
                    "UTF-8 sequence resulted in an invalid character! {:?}",
                    bytes
                ),
                Err(IoError(ioe, bytes)) => panic!(
                    "I/O error {:?} while decoding the sequence {:?} !",
                    ioe, bytes
                ),
            };
            *state = state_machine(chiter, ch, &*state);
            if let State::FinishedToken = state.0 {
                return Some(state.1.clone());
            }
        }
    }

    /// Tokenizes the whole of `input` and checks that the character stream is
    /// exhausted afterwards.
    fn tokenize(input: &str) -> Vec<Token> {
        let mut chiter = Utf8Iterator::new(Cursor::new(input).bytes());
        let mut state = (State::Begin, Token::None);
        let mut tokens = Vec::new();
        while let Some(token) = next_token(&mut chiter, &mut state) {
            tokens.push(token);
        }
        assert!(chiter.next().is_none());
        tokens
    }

    let ident = |name: &str| Token::Identifier(String::from(name));
    let integer = |digits: &str| Token::Integer(String::from(digits));
    let symbol = |text: &str| Token::Symbol(String::from(text));

    // The non-ASCII identifier is spelled once and reused for both the input
    // and the expectation.
    let word = "κόσμε";
    let program = format!("(defun {} (x y) (+ x y))", word);
    assert_eq!(
        vec![
            Token::OpenList,
            ident("defun"),
            ident(word),
            Token::OpenList,
            ident("x"),
            ident("y"),
            Token::CloseList,
            Token::OpenList,
            symbol("+"),
            ident("x"),
            ident("y"),
            Token::CloseList,
            Token::CloseList,
        ],
        tokenize(&program)
    );

    // Integers: several digits, terminated by whitespace or by a delimiter
    // that must be handed back to the stream.
    assert_eq!(
        vec![
            Token::OpenList,
            symbol("+"),
            integer("12"),
            ident("x"),
            Token::CloseList,
        ],
        tokenize("(+ 12 x)")
    );
    assert_eq!(
        vec![Token::OpenList, integer("42"), Token::CloseList],
        tokenize("(42)")
    );
}
}