1#![cfg_attr(feature="bench", feature(test))]
2
3#![deny(warnings)]
4#![allow(clippy::needless_doctest_main)]
5#![allow(clippy::needless_lifetimes)]
6#![doc(test(attr(deny(warnings))))]
7#![doc(test(attr(allow(dead_code))))]
8#![doc(test(attr(allow(unused_variables))))]
9
10#[cfg(all(feature="bench", test))]
11extern crate test;
12
// Attaches the contents of README.md as documentation to a private, otherwise unused
// item, so that the code examples in the README are picked up as documentation tests.
// The item is gated on `cfg(doctest)`: ordinary builds neither read nor embed the README,
// and only doctest collection depends on the file being present.
#[cfg(doctest)]
#[doc = include_str!("../README.md")]
type _DocTestReadme = ();
15
16use std::fmt::{self};
17use std::char::{self};
18use std::error::{Error};
19use std::io::{self, BufRead};
20use arrayvec::{ArrayVec};
21
22#[derive(Debug)]
30pub struct ReadCharError {
31 bytes: ArrayVec<u8, { CHAR_MAX_LEN as usize }>,
32 io_error: io::Error,
33}
34
35impl ReadCharError {
36 pub fn as_bytes(&self) -> &[u8] { &self.bytes }
38 pub fn as_io_error(&self) -> &io::Error { &self.io_error }
40 pub fn into_io_error(self) -> io::Error { self.io_error }
42}
43
44impl Error for ReadCharError {
45 fn source(&self) -> Option<&(dyn Error + 'static)> { Some(&self.io_error) }
46}
47
48impl From<ReadCharError> for io::Error {
49 fn from(e: ReadCharError) -> io::Error { e.into_io_error() }
50}
51
52impl fmt::Display for ReadCharError {
53 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
54 write!(f, "invalid UTF-8 byte sequence")?;
55 for b in self.as_bytes() {
56 write!(f, " {b:02X}")?;
57 }
58 write!(f, " read")?;
59 match self.as_io_error().kind() {
60 io::ErrorKind::InvalidData => { },
61 io::ErrorKind::UnexpectedEof => { write!(f, " (unexpected EOF)")?; }
62 _ => { write!(f, " ({})", self.as_io_error())?; }
63 }
64 Ok(())
65 }
66}
67
68#[derive(Debug)]
78pub struct Chars<'a, T: BufRead + ?Sized>(&'a mut T);
79
80impl<'a, T: BufRead + ?Sized> Iterator for Chars<'a, T> {
81 type Item = io::Result<char>;
82
83 fn next(&mut self) -> Option<Self::Item> {
84 self.0.read_char_raw().map_err(|x| x.into_io_error()).transpose()
85 }
86}
87
88#[derive(Debug)]
93pub struct CharsRaw<'a, T: BufRead + ?Sized>(&'a mut T);
94
95impl<'a, T: BufRead + ?Sized> Iterator for CharsRaw<'a, T> {
96 type Item = Result<char, ReadCharError>;
97
98 fn next(&mut self) -> Option<Self::Item> {
99 self.0.read_char_raw().transpose()
100 }
101}
102
/// Longest byte sequence accepted for a single character.
const CHAR_MAX_LEN: u8 = 4;
/// Payload bits of the lead byte, indexed by the number of tail bytes that follow it.
const LEAD_BYTE_MASK: [u8; CHAR_MAX_LEN as usize] = [
    0b0111_1111,
    0b0001_1111,
    0b0000_1111,
    0b0000_0111,
];
/// Payload bits of a tail (continuation) byte.
const TAIL_BYTE_MASK: u8 = 0b0011_1111;
/// Value of the non-payload bits that every tail byte must have.
const TAIL_BYTE_SIGNATURE: u8 = 0b1000_0000;
/// Number of payload bits contributed by each tail byte.
const TAIL_BYTE_BITS_COUNT: u8 = 6;
/// Smallest code point that may be encoded with the given number of tail bytes (the
/// index); anything smaller is an overlong encoding.
const CHAR_MIN_VALUE: [u32; CHAR_MAX_LEN as usize] = [0, 1 << 7, 1 << 11, 1 << 16];
109
/// Peeks at the next byte of `reader` without consuming it.
///
/// Returns `Ok(None)` when the reader's buffer comes back empty. `fill_buf` calls that
/// fail with [`io::ErrorKind::Interrupted`] are retried; any other error is returned to
/// the caller unchanged.
fn read_byte_and_ignore_interrupts(reader: &mut (impl BufRead + ?Sized)) -> io::Result<Option<u8>> {
    loop {
        match reader.fill_buf() {
            // An interrupted call is not a real failure: try again.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
            Ok(buf) => return Ok(buf.first().copied()),
        }
    }
}
122
123pub trait BufReadCharsExt : BufRead {
125 fn chars(&mut self) -> Chars<'_, Self> { Chars(self) }
135
136 fn chars_raw(&mut self) -> CharsRaw<'_, Self> { CharsRaw(self) }
141
142 fn read_char(&mut self) -> io::Result<Option<char>> {
158 self.read_char_raw().map_err(|x| x.into_io_error())
159 }
160
161 fn read_char_raw(&mut self) -> Result<Option<char>, ReadCharError> {
172 match read_byte_and_ignore_interrupts(self) {
173 Err(e) => Err(ReadCharError { bytes: ArrayVec::new(), io_error: e }),
174 Ok(None) => Ok(None),
175 Ok(Some(lead_byte)) => {
176 self.consume(1);
177 let leading_ones = lead_byte.leading_ones();
178 if leading_ones == 0 { return Ok(Some(char::from(lead_byte))); }
179 if leading_ones == 1 || leading_ones > 4 {
180 let mut bytes = ArrayVec::new();
181 bytes.push(lead_byte);
182 return Err(ReadCharError { bytes, io_error: io::Error::from(io::ErrorKind::InvalidData) });
183 }
184 let mut bytes = ArrayVec::new();
185 bytes.push(lead_byte);
186 let tail_bytes_count = (leading_ones - 1) as u8;
187 let mut item =
188 ((lead_byte & LEAD_BYTE_MASK[tail_bytes_count as usize]) as u32)
189 << (TAIL_BYTE_BITS_COUNT * tail_bytes_count)
190 ;
191 for tail_byte_index in (0 .. tail_bytes_count).rev() {
192 match read_byte_and_ignore_interrupts(self) {
193 Err(e) => return Err(ReadCharError { bytes, io_error: e }),
194 Ok(None) => return Err(ReadCharError { bytes, io_error: io::Error::from(io::ErrorKind::UnexpectedEof) }),
195 Ok(Some(tail_byte)) => {
196 if tail_byte & !TAIL_BYTE_MASK != TAIL_BYTE_SIGNATURE {
197 return Err(ReadCharError { bytes, io_error: io::Error::from(io::ErrorKind::InvalidData) });
198 }
199 bytes.push(tail_byte);
200 item |=
201 ((tail_byte & TAIL_BYTE_MASK) as u32)
202 << (tail_byte_index * TAIL_BYTE_BITS_COUNT)
203 ;
204 self.consume(1);
205 }
206 }
207 }
208 if item < CHAR_MIN_VALUE[tail_bytes_count as usize] {
209 return Err(ReadCharError { bytes, io_error: io::Error::from(io::ErrorKind::InvalidData) });
210 }
211 match char::from_u32(item) {
212 None => Err(ReadCharError { bytes, io_error: io::Error::from(io::ErrorKind::InvalidData) }),
213 Some(item) => Ok(Some(item))
214 }
215 }
216 }
217 }
218}
219
220impl<T: BufRead + ?Sized> BufReadCharsExt for T { }
221
#[cfg(test)]
mod tests {
    use crate::BufReadCharsExt;
    use quickcheck_macros::quickcheck;
    use std::io::{BufRead, BufReader, ErrorKind};

    /// Decodes `input` with `chars_raw`, panicking on the first decoding error.
    fn decode(input: &[u8]) -> Vec<char> {
        BufReader::new(input).chars_raw().map(Result::unwrap).collect()
    }

    /// Decodes `input` with `chars_raw`, checks that exactly one item is produced and
    /// that it is an error, and returns the bytes and I/O error kind of that error.
    fn single_raw_error(input: &[u8]) -> (Vec<u8>, ErrorKind) {
        let items = BufReader::new(input).chars_raw().collect::<Vec<_>>();
        assert_eq!(1, items.len());
        let err = items[0].as_ref().err().unwrap();
        (err.as_bytes().to_vec(), err.as_io_error().kind())
    }

    /// Decodes `input` with `chars`, checks that exactly one item is produced and that
    /// it is an error, and returns the kind of that error.
    fn single_io_error_kind(input: &[u8]) -> ErrorKind {
        let items = BufReader::new(input).chars().collect::<Vec<_>>();
        assert_eq!(1, items.len());
        let err = items[0].as_ref().err().unwrap();
        err.kind()
    }

    #[test]
    fn read_valid_unicode() {
        assert_eq!(
            vec!['A', 'B', 'c', 'd', ' ', 'А', 'Б', 'в', 'г', 'д', ' ', 'U', '\0'],
            decode("ABcd АБвгд U\0".as_bytes())
        );
    }

    #[test]
    fn edgecase_one_two_bytes() {
        assert_eq!(vec!['\x7F'], decode(&[0x7F]));
        assert_eq!(vec!['\u{0080}'], decode(&[0xC2, 0x80]));

        // Truncated two-byte sequence.
        assert_eq!(
            (vec![0xC2_u8], ErrorKind::UnexpectedEof),
            single_raw_error(&[0xC2])
        );

        // Overlong encoding of a one-byte character.
        assert_eq!(
            (vec![0xC1_u8, 0xBF], ErrorKind::InvalidData),
            single_raw_error(&[0xC1, 0xBF])
        );
    }

    #[test]
    fn edgecase_two_three_bytes() {
        assert_eq!(vec!['\u{07FF}'], decode(&[0xDF, 0xBF]));
        assert_eq!(vec!['\u{0800}'], decode(&[0xE0, 0xA0, 0x80]));

        // Truncated three-byte sequence.
        assert_eq!(
            (vec![0xE0_u8, 0xA0], ErrorKind::UnexpectedEof),
            single_raw_error(&[0xE0, 0xA0])
        );

        // Overlong encoding of a two-byte character.
        assert_eq!(
            (vec![0xE0_u8, 0x9F, 0xBF], ErrorKind::InvalidData),
            single_raw_error(&[0xE0, 0x9F, 0xBF])
        );
    }

    #[test]
    fn edgecase_three_four_bytes() {
        assert_eq!(vec!['\u{00FFFF}'], decode(&[0xEF, 0xBF, 0xBF]));
        assert_eq!(vec!['\u{010000}'], decode(&[0xF0, 0x90, 0x80, 0x80]));

        // Truncated four-byte sequence.
        assert_eq!(
            (vec![0xF0_u8, 0x90, 0x80], ErrorKind::UnexpectedEof),
            single_raw_error(&[0xF0, 0x90, 0x80])
        );

        // Overlong encoding of a three-byte character.
        assert_eq!(
            (vec![0xF0_u8, 0x8F, 0xBF, 0xBF], ErrorKind::InvalidData),
            single_raw_error(&[0xF0, 0x8F, 0xBF, 0xBF])
        );
    }

    #[test]
    fn edgecase_four_bytes_max() {
        assert_eq!(vec!['\u{10FFFF}'], decode(&[0xF4, 0x8F, 0xBF, 0xBF]));

        // A lead byte announcing five bytes is rejected on its own; decoding resumes
        // with the byte that follows it.
        let items = BufReader::new(&[0xF8, 0x41][..]).chars_raw().collect::<Vec<_>>();
        assert_eq!(2, items.len());
        let err = items[0].as_ref().err().unwrap();
        assert_eq!(&[0xF8][..], err.as_bytes());
        assert_eq!(ErrorKind::InvalidData, err.as_io_error().kind());
        assert_eq!(&'A', items[1].as_ref().unwrap());

        // First value past the last valid code point.
        assert_eq!(None, std::char::from_u32(0x00110000));
        assert_eq!(
            (vec![0xF4_u8, 0x90, 0x80, 0x80], ErrorKind::InvalidData),
            single_raw_error(&[0xF4, 0x90, 0x80, 0x80])
        );
    }

    #[test]
    fn read_io_valid_unicode() {
        let decoded = BufReader::new("ABcd АБвгд U\0".as_bytes())
            .chars()
            .map(Result::unwrap)
            .collect::<Vec<_>>();
        assert_eq!(
            vec!['A', 'B', 'c', 'd', ' ', 'А', 'Б', 'в', 'г', 'д', ' ', 'U', '\0'],
            decoded
        );
    }

    #[test]
    fn read_valid_unicode_from_dyn_read() {
        let reader: &mut dyn BufRead = &mut BufReader::new("ABcd АБвгд UV".as_bytes());
        let decoded = reader.chars_raw().map(Result::unwrap).collect::<Vec<_>>();
        assert_eq!(
            vec!['A', 'B', 'c', 'd', ' ', 'А', 'Б', 'в', 'г', 'д', ' ', 'U', 'V'],
            decoded
        );
    }

    #[test]
    fn do_not_take_extra_bytes() {
        let mut reader = BufReader::new("ABcd АБвгд UV".as_bytes());

        let head = reader.chars_raw().take(4).map(Result::unwrap).collect::<Vec<_>>();
        assert_eq!(vec!['A', 'B', 'c', 'd'], head);

        let rest = reader.chars_raw().map(Result::unwrap).collect::<Vec<_>>();
        assert_eq!(vec![' ', 'А', 'Б', 'в', 'г', 'д', ' ', 'U', 'V'], rest);
    }

    #[test]
    fn read_value_out_of_range() {
        let (bytes, _) = single_raw_error(&[0xF5, 0x8F, 0xBF, 0xBF]);
        assert_eq!(&[0xF5, 0x8F, 0xBF, 0xBF][..], &bytes[..]);
    }

    #[test]
    fn read_io_value_out_of_range() {
        assert_eq!(
            ErrorKind::InvalidData,
            single_io_error_kind(&[0xF5, 0x8F, 0xBF, 0xBF])
        );
    }

    #[test]
    fn read_io_incomplete_twobyte() {
        assert_eq!(ErrorKind::UnexpectedEof, single_io_error_kind(&[0xC3]));
    }

    #[test]
    fn read_io_incomplete_threebyte() {
        assert_eq!(ErrorKind::UnexpectedEof, single_io_error_kind(&[0xE1, 0xBA]));
    }

    #[test]
    fn read_surrogate() {
        let (bytes, _) = single_raw_error(&[0xED, 0xA0, 0x80]);
        assert_eq!(&[0xED, 0xA0, 0x80][..], &bytes[..]);
    }

    #[test]
    fn read_invalid_sequences() {
        let input: &[u8] = &[
            0x81, 0x82, 0xC1, 0x07, 0xC1, 0x87, 0xC2, 0xC2, 0x82, 0xF7, 0x88, 0x89, 0x07,
        ];
        let items = BufReader::new(input).chars_raw().collect::<Vec<_>>();
        assert_eq!(9, items.len());

        // Bytes reported by the error at `index` / character decoded at `index`.
        let rejected = |index: usize| items[index].as_ref().err().unwrap().as_bytes();
        let decoded = |index: usize| *items[index].as_ref().unwrap();

        assert_eq!(&[0x81][..], rejected(0));
        assert_eq!(&[0x82][..], rejected(1));
        assert_eq!(&[0xC1][..], rejected(2));
        assert_eq!('\x07', decoded(3));
        assert_eq!(&[0xC1, 0x87][..], rejected(4));
        assert_eq!(&[0xC2][..], rejected(5));
        assert_eq!('\u{82}', decoded(6));
        assert_eq!(&[0xF7, 0x88, 0x89][..], rejected(7));
        assert_eq!('\x07', decoded(8));
    }

    /// Any valid string decodes back to itself.
    #[quickcheck]
    fn read_string(s: String) -> bool {
        let decoded: String = BufReader::new(s.as_bytes())
            .chars_raw()
            .map(Result::unwrap)
            .collect();
        s == decoded
    }

    /// For arbitrary bytes, re-encoding the decoded characters and splicing in the bytes
    /// reported by errors reproduces the input exactly.
    #[quickcheck]
    fn read_array(b: Vec<u8>) -> bool {
        let mut rebuilt = Vec::new();
        for item in BufReader::new(&b[..]).chars_raw() {
            match item {
                Ok(c) => rebuilt.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()),
                Err(e) => rebuilt.extend_from_slice(e.as_bytes()),
            }
        }
        b == rebuilt
    }
}
426
#[cfg(all(feature="bench", test))]
mod benchs {
    use crate::BufReadCharsExt;
    use rand::distributions::{Distribution, Uniform};
    use rand::thread_rng;
    use std::hint::black_box;
    use std::io::BufReader;
    use test::Bencher;

    /// Measures decoding of a fixed buffer of uniformly random bytes with `chars_raw`.
    #[bench]
    fn read_array_bench(b: &mut Bencher) {
        const INPUT_LEN: usize = 10000;

        let mut rng = thread_rng();
        let any_byte = Uniform::new_inclusive(u8::MIN, u8::MAX);
        let mut input: Vec<u8> = any_byte.sample_iter(&mut rng).take(INPUT_LEN).collect();

        b.iter(move || {
            // Keep the optimizer from treating the input as a known constant.
            black_box(&mut input);
            let last = BufReader::new(&input[..]).chars_raw().last();
            black_box(last);
        });
    }
}