1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
// NOTE(review): nothing in this file uses non-shorthand field patterns; confirm whether this
// crate-level `allow` is still required before removing it.
#![allow(non_shorthand_field_patterns)]

extern crate arrayvec;

/// Incremental UTF-8 decoding on top of any [`std::io::BufRead`].
///
/// The entry point is [`ReadChars::utf8_chars`], which returns a [`Utf8Chars`] iterator
/// yielding one `char` per call and pulling bytes from the reader only as needed.
pub mod utf_chars {
    use std::char;
    use std::fmt;
    use std::io::{self, BufRead};

    use arrayvec::ArrayVec;

    /// Iterator over the `char`s decoded from a borrowed buffered reader.
    ///
    /// Created by [`ReadChars::utf8_chars`]. Only the bytes belonging to the items already
    /// yielded are consumed from the reader, so the reader can be reused afterwards.
    #[derive(Debug)]
    pub struct Utf8Chars<'a, T: BufRead + ?Sized>(&'a mut T);

    /// The bytes of a sequence that could not be decoded as a UTF-8 encoded `char`.
    ///
    /// The buffer holds the bytes that were consumed from the reader for the failed sequence.
    /// It is empty when the failure was an I/O error that happened before any byte of the
    /// sequence was read.
    #[derive(Debug)]
    pub struct Utf8CharsError(ArrayVec<[u8; UTF8_SEQUENCE_MAX_LENGTH as usize]>);

    impl Utf8CharsError {
        /// Returns the offending bytes as a slice.
        pub fn as_bytes(&self) -> &[u8] {
            &self.0
        }

        /// Consumes the error and returns the offending bytes.
        pub fn into_bytes(self) -> ArrayVec<[u8; UTF8_SEQUENCE_MAX_LENGTH as usize]> {
            self.0
        }
    }

    impl fmt::Display for Utf8CharsError {
        /// Writes `invalid UTF-8 byte sequence` followed by each byte as two upper-case
        /// hexadecimal digits.
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "invalid UTF-8 byte sequence")?;
            for b in &self.0 {
                write!(f, " {:02X}", b)?;
            }
            Ok(())
        }
    }

    impl std::error::Error for Utf8CharsError {}

    /// Maximum number of bytes in one encoded `char`.
    const UTF8_SEQUENCE_MAX_LENGTH: u8 = 4;
    /// Bits of the leading byte that mark the sequence length, indexed by tail byte count.
    const LEADING_BYTE_MASK: [u8; UTF8_SEQUENCE_MAX_LENGTH as usize] = [0x80, 0xE0, 0xF0, 0xF8];
    /// Expected value of the masked leading byte, indexed by tail byte count.
    const LEADING_BYTE_PATTERN: [u8; UTF8_SEQUENCE_MAX_LENGTH as usize] =
        [0x00, 0xC0, 0xE0, 0xF0];
    /// Smallest code point that may be encoded with the given tail byte count.
    ///
    /// Anything below it is an overlong encoding, which UTF-8 forbids.
    const MIN_CODE_POINT: [u32; UTF8_SEQUENCE_MAX_LENGTH as usize] =
        [0x0000, 0x0080, 0x0800, 0x1_0000];
    /// Bits of a tail (continuation) byte that mark it as such.
    const TAIL_BYTE_MASK: u8 = 0xC0;
    /// Expected value of a masked tail byte (`10xxxxxx`).
    const TAIL_BYTE_PATTERN: u8 = 0x80;
    /// Number of payload bits carried by each tail byte.
    const TAIL_BYTE_VALUE_BITS: u8 = 6;

    /// Re-creates the bytes that were read for a (possibly incomplete) sequence.
    ///
    /// `item` is the value accumulated so far, `expected_tail_bytes_count` is the tail length
    /// announced by the leading byte, and `actual_tail_bytes_count` is how many tail bytes were
    /// really read. The result is the leading byte followed by that many tail bytes.
    fn to_utf8(
        item: u32,
        expected_tail_bytes_count: u8,
        actual_tail_bytes_count: u8,
    ) -> ArrayVec<[u8; UTF8_SEQUENCE_MAX_LENGTH as usize]> {
        let expected = expected_tail_bytes_count as usize;
        let mut res = ArrayVec::new();
        // The `as u8` casts deliberately truncate: only the masked low bits are kept.
        let leading_payload =
            (item >> (TAIL_BYTE_VALUE_BITS * expected_tail_bytes_count)) as u8;
        res.push(LEADING_BYTE_PATTERN[expected] | (leading_payload & !LEADING_BYTE_MASK[expected]));
        for tail_byte_index in 0..actual_tail_bytes_count {
            let shift = (expected_tail_bytes_count - 1 - tail_byte_index) * TAIL_BYTE_VALUE_BITS;
            res.push(TAIL_BYTE_PATTERN | ((item >> shift) as u8 & !TAIL_BYTE_MASK));
        }
        res
    }

    /// Returns how many tail bytes `leading_byte` announces, or `None` if it cannot start a
    /// sequence (a stray tail byte, or a byte of the form `11111xxx`).
    fn tail_bytes_count(leading_byte: u8) -> Option<u8> {
        (0..UTF8_SEQUENCE_MAX_LENGTH).find(|&i| {
            leading_byte & LEADING_BYTE_MASK[i as usize] == LEADING_BYTE_PATTERN[i as usize]
        })
    }

    /// Builds the error half of an iterator item from the decoding state.
    fn sequence_error(
        item: u32,
        expected_tail_bytes_count: u8,
        actual_tail_bytes_count: u8,
        io_error: Option<io::Error>,
    ) -> (Utf8CharsError, Option<io::Error>) {
        let bytes = to_utf8(item, expected_tail_bytes_count, actual_tail_bytes_count);
        (Utf8CharsError(bytes), io_error)
    }

    impl<'a, T: BufRead> Iterator for Utf8Chars<'a, T> {
        /// A decoded `char`, or the bytes that failed to decode together with the I/O error
        /// that interrupted the sequence, if there was one.
        type Item = Result<char, (Utf8CharsError, Option<io::Error>)>;

        /// Decodes the next `char` from the reader.
        ///
        /// Returns `None` when the reader has no more data at a sequence boundary. Returns an
        /// error for an invalid leading byte, a missing or invalid tail byte, an overlong
        /// encoding, or a value that is not a valid `char` (a surrogate or a value above
        /// `U+10FFFF`). A byte that fails the tail byte check is left in the reader, so the
        /// next call examines it as a leading byte.
        fn next(&mut self) -> Option<Self::Item> {
            let leading_byte = match self.0.fill_buf() {
                Err(e) => return Some(Err((Utf8CharsError(ArrayVec::new()), Some(e)))),
                Ok(buf) => match buf.first() {
                    None => return None,
                    Some(&byte) => byte,
                },
            };
            self.0.consume(1);

            let tail_bytes_count = match tail_bytes_count(leading_byte) {
                Some(count) => count,
                None => {
                    let mut bytes = ArrayVec::new();
                    bytes.push(leading_byte);
                    return Some(Err((Utf8CharsError(bytes), None)));
                }
            };

            let leading_payload = leading_byte & !LEADING_BYTE_MASK[tail_bytes_count as usize];
            let mut item = u32::from(leading_payload) << (TAIL_BYTE_VALUE_BITS * tail_bytes_count);

            for tail_byte_index in 0..tail_bytes_count {
                let tail_byte = match self.0.fill_buf() {
                    Err(e) => {
                        return Some(Err(sequence_error(
                            item,
                            tail_bytes_count,
                            tail_byte_index,
                            Some(e),
                        )));
                    }
                    Ok(buf) => match buf.first() {
                        Some(&byte) if byte & TAIL_BYTE_MASK == TAIL_BYTE_PATTERN => byte,
                        // End of input or a byte that is not a tail byte: report what was
                        // read so far and leave the unexpected byte in the reader.
                        _ => {
                            return Some(Err(sequence_error(
                                item,
                                tail_bytes_count,
                                tail_byte_index,
                                None,
                            )));
                        }
                    },
                };
                let shift = (tail_bytes_count - 1 - tail_byte_index) * TAIL_BYTE_VALUE_BITS;
                item |= u32::from(tail_byte & !TAIL_BYTE_MASK) << shift;
                self.0.consume(1);
            }

            // Overlong encodings (e.g. `C0 80` for U+0000) are not valid UTF-8 and must be
            // rejected even though their value would be a valid `char`.
            let decoded = if item < MIN_CODE_POINT[tail_bytes_count as usize] {
                None
            } else {
                char::from_u32(item)
            };
            Some(decoded.ok_or_else(|| {
                sequence_error(item, tail_bytes_count, tail_bytes_count, None)
            }))
        }
    }

    /// Extension trait adding UTF-8 `char` iteration to every [`BufRead`].
    pub trait ReadChars: BufRead {
        /// Returns an iterator that decodes the reader's bytes as UTF-8, one `char` at a time.
        ///
        /// The reader is only borrowed, and only the bytes of the yielded items are consumed.
        fn utf8_chars<'a>(&'a mut self) -> Utf8Chars<'a, Self>;
    }

    impl<T: BufRead> ReadChars for T {
        fn utf8_chars<'a>(&'a mut self) -> Utf8Chars<'a, Self> {
            Utf8Chars(self)
        }
    }

    #[cfg(test)]
    mod tests {
        use std::io::{BufRead, BufReader};
        use std::vec::Vec;

        use crate::utf_chars::ReadChars;

        #[test]
        fn read_valid_unicode() {
            assert_eq!(
                vec!['A', 'B', 'c', 'd', ' ', 'А', 'Б', 'в', 'г', 'д', ' ', 'U', 'V'],
                BufReader::new("ABcd АБвгд UV".as_bytes())
                    .utf8_chars()
                    .map(|x| x.unwrap())
                    .collect::<Vec<_>>()
            );
        }

        #[test]
        fn read_valid_unicode_from_dyn_read() {
            let mut bytes: &mut dyn BufRead = &mut BufReader::new("ABcd АБвгд UV".as_bytes());
            assert_eq!(
                vec!['A', 'B', 'c', 'd', ' ', 'А', 'Б', 'в', 'г', 'д', ' ', 'U', 'V'],
                bytes.utf8_chars().map(|x| x.unwrap()).collect::<Vec<_>>()
            );
        }

        #[test]
        fn do_not_take_extra_bytes() {
            let mut bytes = BufReader::new("ABcd АБвгд UV".as_bytes());
            assert_eq!(
                vec!['A', 'B', 'c', 'd'],
                bytes.utf8_chars().take(4).map(|x| x.unwrap()).collect::<Vec<_>>()
            );
            assert_eq!(
                vec![' ', 'А', 'Б', 'в', 'г', 'д', ' ', 'U', 'V'],
                bytes.utf8_chars().map(|x| x.unwrap()).collect::<Vec<_>>()
            );
        }

        #[test]
        fn read_value_out_of_range() {
            let mut bytes = BufReader::new(&[0xF5, 0x8F, 0xBF, 0xBF][..]);
            let res = bytes.utf8_chars().collect::<Vec<_>>();
            assert_eq!(1, res.len());
            let err = res[0].as_ref().err().unwrap();
            assert_eq!(&[0xF5, 0x8F, 0xBF, 0xBF][..], err.0.as_bytes());
        }

        #[test]
        fn reject_overlong_two_byte_encoding() {
            let mut bytes = BufReader::new(&[0xC0, 0x80, b'A'][..]);
            let res = bytes.utf8_chars().collect::<Vec<_>>();
            assert_eq!(2, res.len());
            let err = res[0].as_ref().err().unwrap();
            assert_eq!(&[0xC0, 0x80][..], err.0.as_bytes());
            assert!(err.1.is_none());
            assert_eq!(Some(&'A'), res[1].as_ref().ok());
        }

        #[test]
        fn reject_overlong_three_byte_encoding() {
            let mut bytes = BufReader::new(&[0xE0, 0x80, 0xAF][..]);
            let res = bytes.utf8_chars().collect::<Vec<_>>();
            assert_eq!(1, res.len());
            let err = res[0].as_ref().err().unwrap();
            assert_eq!(&[0xE0, 0x80, 0xAF][..], err.0.as_bytes());
        }

        #[test]
        fn report_truncated_sequence() {
            let mut bytes = BufReader::new(&[0xE2, 0x82][..]);
            let res = bytes.utf8_chars().collect::<Vec<_>>();
            assert_eq!(1, res.len());
            let err = res[0].as_ref().err().unwrap();
            assert_eq!(&[0xE2, 0x82][..], err.0.as_bytes());
        }

        #[test]
        fn report_stray_tail_byte() {
            let mut bytes = BufReader::new(&[0x80][..]);
            let res = bytes.utf8_chars().collect::<Vec<_>>();
            assert_eq!(1, res.len());
            let err = res[0].as_ref().err().unwrap();
            assert_eq!(&[0x80][..], err.0.as_bytes());
        }
    }
}