#![feature(option_result_unwrap_unchecked)]
#[deny(missing_crate_level_docs, missing_docs, missing_doc_code_examples)]
mod error;
use error::Result;
use std::borrow::Cow;
use std::io::{self, ErrorKind};
use std::rc::Rc;
use std::slice::from_raw_parts;
use std::str::{from_utf8, from_utf8_unchecked, FromStr};
pub use error::Error;
/// Extension of `std::io::BufRead` with methods that read UTF-8 text directly out of the
/// reader's internal buffer.
///
/// Every method has a provided body, so implementors do not have to write anything.
pub trait BufRead: io::BufRead {
/// Reads a string out of the bytes currently returned by `fill_buf`.
///
/// * Returns an empty string when `fill_buf` yields no bytes.
/// * When the whole buffer is valid UTF-8, consumes it and returns it borrowed.
/// * When only a prefix of the buffer is valid, consumes and returns that prefix.
/// * When the buffer does not start with valid UTF-8 and it holds at least as many bytes as
///   its first byte announces (see `codepoint_length`), returns the `Utf8Error` converted
///   into `Error`; nothing is consumed in that case.
/// * Otherwise the buffer stops in the middle of a codepoint: the partial bytes are consumed
///   and the remainder is fetched through `read_across_boundary`, which returns an owned
///   string.
///
/// # Errors
///
/// Propagates the error of `fill_buf`, the UTF-8 error described above, and any error
/// returned by `read_across_boundary`.
fn read_str(&mut self) -> Result<Cow<str>> {
let read_bytes = self.fill_buf()?;
let read_len = read_bytes.len();
if read_len == 0 {
return Ok(Cow::from(""));
}
// Keep a raw pointer to the buffered bytes so they can still be returned after `consume`,
// which needs `&mut self` and therefore ends the `read_bytes` borrow.
let ptr = read_bytes.as_ptr();
match from_utf8(read_bytes) {
Ok(_) => {
self.consume(read_len);
// SAFETY: `ptr`/`read_len` describe the slice `fill_buf` just returned, and `from_utf8`
// validated all of it.
// NOTE(review): reading it after `consume` assumes the reader keeps that memory alive and
// unmodified until the next `fill_buf` -- confirm for every `io::BufRead` implementor.
Ok(Cow::from(unsafe {
from_utf8_unchecked(from_raw_parts(ptr, read_len))
}))
}
Err(e) => {
// Number of leading bytes that form valid UTF-8.
let len = e.valid_up_to();
if len != 0 {
self.consume(len);
// SAFETY: the first `len` bytes of the slice were reported valid by `valid_up_to`.
// NOTE(review): same assumption about the buffer surviving `consume` as above.
Ok(Cow::from(unsafe {
from_utf8_unchecked(from_raw_parts(ptr, len))
}))
} else if read_len >= codepoint_length(read_bytes[0]) {
// Enough bytes are buffered for the announced codepoint, so the data itself is invalid.
Err(Error::from(e))
} else {
// The buffer ends before the announced codepoint does: hand the partial bytes over as an
// owned copy and try to complete them with the next buffer content.
self.consume(read_len);
read_across_boundary(self, Vec::from(unsafe { from_raw_parts(ptr, read_len) }))
}
}
}
}
/// Reads a single codepoint, whose length is taken from the first buffered byte
/// (see `codepoint_length`).
///
/// Returns an empty string when `fill_buf` yields no bytes. When fewer bytes than announced
/// are buffered, the partial bytes are consumed and completion is delegated to
/// `read_across_boundary`. When the announced bytes are buffered but are not valid UTF-8, the
/// `Utf8Error` is returned (converted into `Error`) and nothing is consumed.
#[doc(hidden)]
fn read_codepoint(&mut self) -> Result<Cow<str>> {
let read_bytes = self.fill_buf()?;
let read_len = read_bytes.len();
if read_len == 0 {
return Ok(Cow::from(""));
}
// Raw pointer kept for the same reason as in `read_str`: the bytes are returned after
// `consume` has ended the `read_bytes` borrow.
let ptr = read_bytes.as_ptr();
let len = codepoint_length(read_bytes[0]);
if read_len < len {
self.consume(read_len);
read_across_boundary(self, Vec::from(unsafe { from_raw_parts(ptr, read_len) }))
} else {
match from_utf8(&read_bytes[..len]) {
Ok(_) => {
self.consume(len);
// SAFETY: the first `len` bytes were validated by `from_utf8` just above.
// NOTE(review): assumes the buffer memory is still intact after `consume` -- confirm.
Ok(Cow::from(unsafe {
from_utf8_unchecked(from_raw_parts(ptr, len))
}))
}
Err(e) => Err(Error::from(e)),
}
}
}
/// Reads a single `char` through `read_codepoint`.
///
/// Returns `'\0'` when `read_codepoint` returns an empty string, so a NUL character that is
/// actually present in the input cannot be told apart from that case by the return value.
///
/// # Errors
///
/// Propagates any error of `read_codepoint`.
fn read_char(&mut self) -> Result<char> {
let c = self.read_codepoint()?;
if c.is_empty() {
return Ok('\0');
}
// SAFETY (assumption): `unwrap_unchecked` requires `c` to hold exactly one `char`.
// `read_codepoint` returns validated UTF-8 whose byte length was derived from its first
// byte -- TODO confirm this always amounts to a single `char`.
Ok(unsafe { char::from_str(c.as_ref()).unwrap_unchecked() })
}
/// Returns an iterator over this reader that is backed by `read_str`; its reusable string
/// buffer starts with a capacity of 8 KiB.
fn str_iter(&mut self) -> StrIter<'_, Self> {
let default_cap = 8 * 1024;
StrIter {
reader: self,
buf: Rc::new(String::with_capacity(default_cap)),
default_cap,
ended: false,
}
}
/// Returns an iterator over this reader that is backed by `read_codepoint`; its reusable
/// string buffer starts with a capacity of 4 bytes.
#[doc(hidden)]
fn codepoint_iter(&mut self) -> CodepointIter<'_, Self> {
let default_cap = 4;
CodepointIter {
reader: self,
buf: Rc::new(String::with_capacity(default_cap)),
default_cap,
ended: false,
}
}
/// Returns an iterator over this reader that is backed by `read_char`.
fn char_iter(&mut self) -> CharIter<'_, Self> {
CharIter {
reader: self,
ended: false,
}
}
/// Returns an iterator that passes every string obtained from `read_str` to `f` and yields
/// the results. The closure is stored behind an `Rc`.
fn str_map<F, T>(&mut self, f: F) -> StrMap<'_, Self, F>
where
F: FnMut(Cow<str>) -> T,
{
StrMap {
reader: self,
map: Rc::new(f),
ended: false,
}
}
/// Returns an iterator that passes every codepoint obtained from `read_codepoint` to `f` and
/// yields the results. The closure is stored behind an `Rc`.
#[doc(hidden)]
fn codepoint_map<F, T>(&mut self, f: F) -> CodepointMap<'_, Self, F>
where
F: FnMut(Cow<str>) -> T,
{
CodepointMap {
reader: self,
map: Rc::new(f),
ended: false,
}
}
}
/// Blanket implementation: every `io::BufRead` type gets the provided methods of `BufRead`.
impl<R: io::BufRead> BufRead for R {}
/// Iterator created by `BufRead::str_iter`, yielding the strings read from the borrowed reader
/// as `Rc<String>` values.
pub struct StrIter<'r, R: ?Sized> {
    /// Reader the strings are pulled from.
    reader: &'r mut R,
    /// Buffer handed out to the consumer; shared through the `Rc`.
    buf: Rc<String>,
    /// Capacity given to a freshly allocated buffer.
    default_cap: usize,
    /// Set once the iterator has finished; `next` returns `None` from then on.
    ended: bool,
}
impl<R> Iterator for StrIter<'_, R>
where
R: io::BufRead,
{
type Item = Result<Rc<String>>;
fn next(&mut self) -> Option<Self::Item> {
if self.ended {
return None;
}
let buf = match Rc::get_mut(&mut self.buf) {
None => {
self.buf = Rc::new(String::with_capacity(self.default_cap));
Rc::make_mut(&mut self.buf)
}
Some(buf) => {
buf.clear();
buf
}
};
loop {
match self.reader.read_str() {
Err(e) => {
if let ErrorKind::Interrupted = e.kind() {
continue;
}
self.ended = true;
break Some(Err(e));
}
Ok(s) => {
if s.is_empty() {
self.ended = true;
break None;
} else {
buf.push_str(s.as_ref());
break Some(Ok(Rc::clone(&self.buf)));
}
}
}
}
}
}
/// Iterator created by `BufRead::codepoint_iter`, yielding one codepoint per item as an
/// `Rc<String>`.
#[doc(hidden)]
pub struct CodepointIter<'r, R: ?Sized> {
    /// Reader the codepoints are pulled from.
    reader: &'r mut R,
    /// Buffer handed out to the consumer; shared through the `Rc`.
    buf: Rc<String>,
    /// Capacity given to a freshly allocated buffer.
    default_cap: usize,
    /// Set once the iterator has finished; `next` returns `None` from then on.
    ended: bool,
}
impl<R> Iterator for CodepointIter<'_, R>
where
R: io::BufRead,
{
type Item = Result<Rc<String>>;
fn next(&mut self) -> Option<Self::Item> {
if self.ended {
return None;
}
let buf = match Rc::get_mut(&mut self.buf) {
None => {
self.buf = Rc::new(String::with_capacity(self.default_cap));
Rc::make_mut(&mut self.buf)
}
Some(buf) => {
buf.clear();
buf
}
};
loop {
match self.reader.read_codepoint() {
Err(e) => {
if let ErrorKind::Interrupted = e.kind() {
continue;
}
self.ended = true;
break Some(Err(e));
}
Ok(s) => {
if s.is_empty() {
self.ended = true;
break None;
} else {
buf.push_str(s.as_ref());
break Some(Ok(Rc::clone(&self.buf)));
}
}
}
}
}
}
/// Iterator created by `BufRead::str_map`, yielding the result of applying a closure to every
/// string read from the borrowed reader.
pub struct StrMap<'r, R: ?Sized, F> {
    /// Reader the strings are pulled from.
    reader: &'r mut R,
    /// Mapping closure applied to each string.
    map: Rc<F>,
    /// Set once the iterator has finished; `next` returns `None` from then on.
    ended: bool,
}
/// Applies the stored closure to every string produced by `read_str`.
impl<R, F, T> Iterator for StrMap<'_, R, F>
where
    R: io::BufRead,
    F: FnMut(Cow<str>) -> T,
{
    type Item = Result<T>;

    /// Reads the next string and maps it, retrying reads that fail with
    /// `ErrorKind::Interrupted`.
    ///
    /// Returns `None` on an empty read and after any other error has been yielded once.
    ///
    /// # Panics
    ///
    /// Panics when the `Rc` holding the closure is not uniquely owned.
    fn next(&mut self) -> Option<Self::Item> {
        if self.ended {
            return None;
        }
        loop {
            let text = match self.reader.read_str() {
                Ok(text) => text,
                Err(e) if matches!(e.kind(), ErrorKind::Interrupted) => continue,
                Err(e) => {
                    self.ended = true;
                    return Some(Err(e));
                }
            };
            if text.is_empty() {
                self.ended = true;
                return None;
            }
            let mapper = Rc::get_mut(&mut self.map)
                .expect("MappingIter's mapping function cannot be shared !");
            return Some(Ok(mapper(text)));
        }
    }
}
/// Iterator created by `BufRead::codepoint_map`, yielding the result of applying a closure to
/// every codepoint read from the borrowed reader.
#[doc(hidden)]
pub struct CodepointMap<'r, R: ?Sized, F> {
    /// Reader the codepoints are pulled from.
    reader: &'r mut R,
    /// Mapping closure applied to each codepoint.
    map: Rc<F>,
    /// Set once the iterator has finished; `next` returns `None` from then on.
    ended: bool,
}
/// Applies the stored closure to every codepoint produced by `read_codepoint`.
impl<R, F, T> Iterator for CodepointMap<'_, R, F>
where
    R: io::BufRead,
    F: FnMut(Cow<str>) -> T,
{
    type Item = Result<T>;

    /// Reads the next codepoint and maps it, retrying reads that fail with
    /// `ErrorKind::Interrupted`.
    ///
    /// Returns `None` on an empty read and after any other error has been yielded once.
    ///
    /// # Panics
    ///
    /// Panics when the `Rc` holding the closure is not uniquely owned.
    fn next(&mut self) -> Option<Self::Item> {
        if self.ended {
            return None;
        }
        loop {
            let codepoint = match self.reader.read_codepoint() {
                Ok(codepoint) => codepoint,
                Err(e) if matches!(e.kind(), ErrorKind::Interrupted) => continue,
                Err(e) => {
                    self.ended = true;
                    return Some(Err(e));
                }
            };
            if codepoint.is_empty() {
                self.ended = true;
                return None;
            }
            let mapper = Rc::get_mut(&mut self.map)
                .expect("MappingIter's mapping function cannot be shared !");
            return Some(Ok(mapper(codepoint)));
        }
    }
}
/// Iterator created by `BufRead::char_iter`, yielding the characters read from the borrowed
/// reader.
pub struct CharIter<'r, R: ?Sized> {
    /// Reader the characters are pulled from.
    reader: &'r mut R,
    /// Set once the iterator has finished; `next` returns `None` from then on.
    ended: bool,
}
impl<R> Iterator for CharIter<'_, R>
where
R: io::BufRead,
{
type Item = Result<char>;
fn next(&mut self) -> Option<Self::Item> {
if self.ended {
return None;
}
match self.reader.read_char() {
Ok(c) => {
if c == '\0' {
self.ended = true;
None
} else {
Some(Ok(c))
}
}
Err(e) => {
self.ended = true;
Some(Err(e))
}
}
}
}
fn read_across_boundary<R>(reader: &mut R, mut leftovers: Vec<u8>) -> Result<Cow<str>>
where
R: io::BufRead + ?Sized,
{
debug_assert!(!leftovers.is_empty());
let len = codepoint_length(leftovers[0]);
let first_read_len = leftovers.len();
debug_assert!(len > first_read_len);
let additional_len = (len - first_read_len) as usize;
let additional_bytes = &reader.fill_buf()?;
if additional_bytes.len() < additional_len {
return Err(Error::from(ErrorKind::UnexpectedEof).with_leftovers(leftovers));
}
leftovers.extend_from_slice(&additional_bytes[..additional_len]);
reader.consume(additional_len);
match String::from_utf8(leftovers) {
Ok(s) => Ok(Cow::from(s)),
Err(e) => Err(Error::from(e)),
}
}
/// Returns the length in bytes of the UTF-8 sequence announced by its first byte `x`.
///
/// No validation is performed: every byte in `0x80..=0xDF` (which includes continuation
/// bytes) reports 2, and every byte from `0xF0` upwards reports 4.
#[inline]
fn codepoint_length(x: u8) -> usize {
    match x {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}
/// Tests of `BufRead::read_str` on in-memory readers, with and without a buffer boundary
/// falling inside a codepoint.
#[cfg(test)]
mod read_str_tests {
use crate::BufRead;
use std::io::{BufReader, Cursor, ErrorKind};
use std::str::Utf8Error;
use std::string::FromUtf8Error;
/// An empty reader yields an empty string, not an error.
#[test]
fn empty_read() {
let mut r = Cursor::new("");
let s = r.read_str();
assert!(s.is_ok());
let s = s.unwrap();
assert!(s.is_empty());
}
/// Invalid data that is entirely available in the buffer is reported as `InvalidData`
/// wrapping a `Utf8Error`.
#[test]
fn invalid_in_buffer() {
// 0x9f is a continuation byte, so the input has no valid UTF-8 prefix.
let mut r = Cursor::new([0x9fu8, 0x92, 0x96, 0x0]);
let e = r.read_str();
assert!(e.is_err());
let e = e.unwrap_err();
assert_eq!(e.kind(), ErrorKind::InvalidData);
let e = e.into_inner_checked();
assert!(e.is_ok());
let e = e.unwrap();
assert!(e.is_some());
let e = e.unwrap();
assert!(e.is::<Utf8Error>());
}
/// A codepoint truncated by the end of the input is reported as `UnexpectedEof` with the
/// partial bytes as leftovers and no inner error.
#[test]
fn incomplete_in_buffer() {
// Only the first 3 of the 4 bytes encoding the emoji.
let mut r = Cursor::new(&"💖".as_bytes()[..3]);
let e = r.read_str();
assert!(e.is_err());
let e = e.unwrap_err();
assert_eq!(e.kind(), ErrorKind::UnexpectedEof);
assert!(!e.leftovers().is_empty());
let e = e.into_inner_lossy();
assert!(e.is_none());
}
/// Invalid data split by the 2-byte buffer is reported as `InvalidData` with leftovers,
/// wrapping a `FromUtf8Error`.
#[test]
fn invalid_across_boundary() {
let mut r = BufReader::<&[u8]>::with_capacity(2, [0xffu8, 0x92, 0x96, 0x0].as_ref());
let e = r.read_str();
assert!(e.is_err());
let e = e.unwrap_err();
assert_eq!(e.kind(), ErrorKind::InvalidData);
assert!(!e.leftovers().is_empty());
let e = e.into_inner_lossy();
assert!(e.is_some());
let e = e.unwrap();
assert!(e.is::<FromUtf8Error>());
}
/// A truncated codepoint split by the 2-byte buffer is reported as `UnexpectedEof` with no
/// inner error.
#[test]
fn incomplete_across_boundary() {
let mut r = BufReader::<&[u8]>::with_capacity(2, &"💖".as_bytes()[..3]);
let e = r.read_str();
assert!(e.is_err());
let e = e.unwrap_err();
assert_eq!(e.kind(), ErrorKind::UnexpectedEof);
let e = e.into_inner_lossy();
assert!(e.is_none());
}
/// A fully buffered, valid codepoint is returned as is.
#[test]
fn complete_successful_read() {
let mut r = Cursor::new("💖");
let s = r.read_str();
assert!(s.is_ok());
let s = s.unwrap();
assert_eq!(s, "💖");
}
/// When valid text is followed by invalid bytes, the valid prefix is returned.
#[test]
fn incomplete_successful_read() {
// "o\n" followed by a stray continuation byte.
let mut r = Cursor::new([0x6fu8, 0xa, 0x9f, 0x92, 0x96, 0x0]);
let s = r.read_str();
assert!(s.is_ok());
let s = s.unwrap();
assert_eq!(s, "o\n");
}
/// A valid 4-byte codepoint split by the 2-byte buffer is reassembled.
#[test]
fn read_across_boundary() {
let mut r = BufReader::<&[u8]>::with_capacity(2, "💖".as_ref());
let s = r.read_str();
assert!(s.is_ok());
let s = s.unwrap();
assert_eq!(s, "💖");
}
/// Text mixing 1- to 4-byte codepoints is returned in a single read; the following read is
/// empty.
#[test]
fn multi_codepoints_read() {
let mut r = Cursor::new("foo💖bär€");
let s = r.read_str();
assert!(s.is_ok());
let s = s.unwrap();
assert_eq!(s, "foo💖bär€");
let s = r.read_str();
assert!(s.is_ok());
let s = s.unwrap();
assert_eq!(s, "");
}
}
/// Tests of `read_str` on `BufReader`s whose capacity (1 to 3 bytes) is smaller than the
/// longest possible codepoint.
#[cfg(test)]
mod buf_too_small_tests {
// Generates one test: `$name` is the test function, `$cap` the `BufReader` capacity and
// `$input` the text to read; the trailing keyword selects the expected outcome.
macro_rules! buf_too_small_test {
// `success`: every read must be `Ok`, and at least one non-empty string must be returned
// before the first empty one.
($name:ident $cap:literal $input:literal: success) => {
#[test]
fn $name() {
let mut r = BufReader::<&[u8]>::with_capacity($cap, $input.as_bytes());
let mut call_count = 0;
loop {
let s = r.read_str();
assert!(s.is_ok());
let s = s.unwrap();
if s.is_empty() {
break;
} else {
call_count += 1;
}
}
assert_ne!(call_count, 0);
}
};
// `failure`: reading goes on until an error shows up; every `Ok` before it must be
// non-empty, and the error must be `UnexpectedEof` with leftovers and no inner error.
($name:ident $cap:literal $input:literal: failure) => {
#[test]
fn $name() {
let mut r = BufReader::<&[u8]>::with_capacity($cap, $input.as_bytes());
loop {
let e = r.read_str();
match e {
Ok(s) => {
assert!(!s.is_empty());
}
Err(e) => {
assert_eq!(e.kind(), ErrorKind::UnexpectedEof);
assert!(!e.leftovers().is_empty());
let e = e.into_inner_lossy();
assert!(e.is_none());
break;
}
}
}
}
};
}
/// Capacity 1: codepoints of 1 and 2 bytes are expected to be read, those of 3 and 4 bytes
/// to fail.
mod buf_capacity_1 {
use crate::BufRead;
use std::io::{BufReader, ErrorKind};
buf_too_small_test!(codepoint_length_1_offset_0 1 "f": success);
buf_too_small_test!(codepoint_length_2_offset_0 1 "ä": success);
buf_too_small_test!(codepoint_length_3_offset_0 1 "€": failure);
buf_too_small_test!(codepoint_length_4_offset_0 1 "💖": failure);
}
/// Capacity 2: only a 4-byte codepoint preceded by one ASCII byte is expected to fail.
mod buf_capacity_2 {
use crate::BufRead;
use std::io::{BufReader, ErrorKind};
buf_too_small_test!(codepoint_length_1_offset_0 2 "f": success);
buf_too_small_test!(codepoint_length_2_offset_0 2 "ä": success);
buf_too_small_test!(codepoint_length_2_offset_1 2 "xä": success);
buf_too_small_test!(codepoint_length_3_offset_0 2 "€": success);
buf_too_small_test!(codepoint_length_3_offset_1 2 "x€": success);
buf_too_small_test!(codepoint_length_4_offset_0 2 "💖": success);
buf_too_small_test!(codepoint_length_4_offset_1 2 "x💖": failure);
}
/// Capacity 3: every listed combination of codepoint length and offset is expected to be
/// read.
mod buf_capacity_3 {
use crate::BufRead;
use std::io::BufReader;
buf_too_small_test!(codepoint_length_1_offset_0 3 "f": success);
buf_too_small_test!(codepoint_length_2_offset_0 3 "ä": success);
buf_too_small_test!(codepoint_length_2_offset_1 3 "xä": success);
buf_too_small_test!(codepoint_length_3_offset_0 3 "€": success);
buf_too_small_test!(codepoint_length_3_offset_1 3 "x€": success);
buf_too_small_test!(codepoint_length_3_offset_2 3 "xx€": success);
buf_too_small_test!(codepoint_length_4_offset_0 3 "💖": success);
buf_too_small_test!(codepoint_length_4_offset_1 3 "x💖": success);
buf_too_small_test!(codepoint_length_4_offset_2 3 "xx💖": success);
}
}