//! This crate provides a trait, [`BufRead`], providing functions to read utf-8 text streams
//! using an [`io::BufRead`] without waiting for newline delimiters.
//!
//! # Quick Start
//!
//! The simplest way to read a whole file with a [`BufRead`] type is to repeatedly call its
//! [`read_utf8`] method:
//!
//! ```
//! use utf8_bufread::BufRead;
//! use std::io::BufReader;
//!
//! // Reader may be any type implementing io::BufRead
//! // We'll just use a BufReader wrapping a slice for this example
//! let mut reader = BufReader::<&[u8]>::new("💖".as_ref());
//! // The string we'll use to store the text of the read file
//! let mut text = String::new();
//! loop { // Loop until EOF
//! match reader.read_utf8(&mut text) {
//! Ok(0) => break, // EOF
//! Ok(_) => continue,
//! Err(e) => panic!("{}", e), // io::Error or Utf8Error
//! }
//! }
//! assert_eq!("💖", text.as_str());
//! ```
//!
//! *Note that this example does exactly what this crate tries to avoid: storing the whole file in
//! memory.*
//!
//! See [`BufRead`]'s documentation for more.
//!
//! [`BufRead`]: self::BufRead
//! [`io::BufRead`]: std::io::BufRead
//! [`read_utf8`]: self::BufRead::read_utf8
use std::io::{self, Error, ErrorKind};
use std::str::{from_utf8, from_utf8_unchecked};
#[deny(missing_crate_level_docs, missing_docs, missing_doc_code_examples)]
/// A trait implemented for all types implementing [`io::BufRead`], providing functions to
/// read utf-8 text streams without waiting for newline delimiters.
///
/// [`io::BufRead`]: std::io::BufRead
pub trait BufRead: io::BufRead {
    /// Reads at most one internal buffer's worth of bytes (for a [`BufReader`], its
    /// [`capacity`]) and appends their utf-8 representation to the provided `buf`. The only
    /// exception is a single codepoint straddling two buffer fills, which is read as a whole.
    /// It returns the number of bytes read as an [`io::Result`]`<`[`usize`]`>`.
    ///
    /// This function will read bytes from the underlying stream until its buffer is full, an
    /// invalid or incomplete codepoint is found, or EOF is found. Once found, all codepoints
    /// up to, including the EOF (if found), but not including the invalid or incomplete codepoint
    /// (if found), will be appended to the provided `buf`.
    ///
    /// If the operation is successful, this function returns the number of bytes read. Note this
    /// may **not** be the number of [`char`]s read, as UTF-8 is a variable-length encoding.
    ///
    /// If this function returns [`Ok(0)`], the stream has reached EOF.
    ///
    /// This function avoids the usual issues of using
    /// [`BufRead`]`::`[`read_line`]`(&mut self, &mut `[`String`]`)` or
    /// [`BufRead`]`::`[`lines`]`(self)` on big text files without newline delimiters: it will
    /// not load the whole file in memory.
    ///
    /// [`BufReader`]: std::io::BufReader
    /// [`capacity`]: std::io::BufReader::capacity
    /// [`io::Result`]: std::io::Result
    /// [`Ok(0)`]: Ok
    /// [`BufRead`]: std::io::BufRead
    /// [`read_line`]: std::io::BufRead::read_line
    /// [`lines`]: std::io::BufRead::lines
    ///
    /// # Errors
    ///
    /// This function will immediately return any errors returned by [`fill_buf`].
    ///
    /// If an [`Utf8Error`] is returned by the internal call to [`from_utf8`], all valid codepoints
    /// are returned, and no error is returned, unless no valid codepoints were read. This
    /// allows not to lose any valid data, and the error will be returned on the next call.
    ///
    /// If the first codepoint encountered by [`from_utf8`] is invalid, an
    /// [`ErrorKind`]`::`[`InvalidData`] caused by an [`Utf8Error`] is returned.
    ///
    /// If fewer than 4 bytes are buffered and they do not start with a valid codepoint, they are
    /// consumed and the buffer is refilled once in an attempt to complete the codepoint:
    ///
    /// - if the refilled buffer is empty, EOF was reached on an incomplete codepoint and an
    ///   [`ErrorKind`]`::`[`InvalidData`] is returned;
    /// - if the refilled buffer runs out while the codepoint is still incomplete, the internal
    ///   buffer is too small to decode it and an [`ErrorKind`]`::`[`InvalidInput`] is returned;
    /// - if the bytes turn out to be malformed, an [`ErrorKind`]`::`[`InvalidData`] caused by an
    ///   [`Utf8Error`] is returned.
    ///
    /// These errors cannot be recovered from: the undecodable leading bytes have already been
    /// consumed, and you will have to read bytes manually to determine what went wrong.
    ///
    /// [`fill_buf`]: std::io::BufRead::fill_buf
    /// [`Utf8Error`]: std::str::Utf8Error
    /// [`from_utf8`]: std::str::from_utf8
    /// [`ErrorKind`]: std::io::ErrorKind
    /// [`InvalidData`]: std::io::ErrorKind::InvalidData
    /// [`InvalidInput`]: std::io::ErrorKind::InvalidInput
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bufread::BufRead;
    /// use std::io::{BufReader, ErrorKind};
    ///
    /// // "foo\nbar" + some invalid bytes
    /// // We give the buffer more than enough capacity to be able to read all the bytes in one
    /// // call
    /// let mut reader = BufReader::with_capacity(
    ///     16,
    ///     [0x66u8, 0x6f, 0x6f, 0xa, 0x62, 0x61, 0x72, 0x9f, 0x92, 0x96].as_ref(),
    /// );
    /// let mut buf = String::new();
    ///
    /// // On the first read_utf8() call, we will read up to the first byte of the invalid
    /// // codepoint (ie "foo\nbar")
    /// let n_read = reader
    ///     .read_utf8(&mut buf)
    ///     .expect("We will get all the valid bytes without error");
    /// assert_eq!("foo\nbar", buf.as_str());
    /// assert_eq!(7, n_read);
    ///
    /// // Then on the second call we will get the InvalidData error caused by the Utf8Error error,
    /// // as there is no bytes forming valid codepoints left
    /// let read_err = reader.read_utf8(&mut buf).expect_err("We will get an error");
    /// assert_eq!(ErrorKind::InvalidData, read_err.kind());
    /// assert_eq!(7, buf.len()); // no byte appended to buf
    /// ```
    fn read_utf8(&mut self, buf: &mut String) -> io::Result<usize> {
        // Fill the buffer from the inner reader and look at its content. Any error, including
        // `ErrorKind::Interrupted`, is handed back to the caller untouched.
        let read_bytes = self.fill_buf()?;

        // Fast path: the whole buffer is valid utf-8 (this also covers EOF, where the buffer is
        // empty and 0 is returned).
        let utf8_error = match from_utf8(read_bytes) {
            Ok(s) => {
                let used = read_bytes.len();
                buf.push_str(s);
                self.consume(used);
                return Ok(used);
            }
            Err(e) => e,
        };

        // Return every valid byte first, leaving the invalid or incomplete codepoint at the
        // beginning of the buffer so that the next call deals with it.
        let used = utf8_error.valid_up_to();
        if used > 0 {
            // SAFETY: `Utf8Error::valid_up_to` guarantees that `read_bytes[..used]` is valid
            // utf-8.
            buf.push_str(unsafe { from_utf8_unchecked(&read_bytes[..used]) });
            self.consume(used);
            return Ok(used);
        }

        // From here on, not a single codepoint could be decoded from the start of the buffer.
        // A codepoint is at most 4 bytes long, so with 4 or more bytes available the data is
        // truly invalid.
        if read_bytes.len() >= 4 {
            return Err(Error::new(ErrorKind::InvalidData, utf8_error));
        }

        // Fewer than 4 bytes: the codepoint may simply be cut by the end of the buffer. Keep the
        // partial bytes aside and consume them, so that the next call to `fill_buf` reads more
        // bytes from the underlying stream.
        let mut v = read_bytes.to_vec();
        self.consume(v.len());
        let additional_bytes = self.fill_buf()?;
        if additional_bytes.is_empty() {
            // No additional bytes, we reached EOF on an incomplete codepoint
            return Err(Error::from(ErrorKind::InvalidData));
        }
        let available = additional_bytes.len();

        // Append bytes one at a time until the codepoint is complete, never going past the
        // 4-byte maximum nor past what is actually buffered.
        let missing = 4 - v.len();
        let mut definitely_invalid = false;
        for (i, &byte) in additional_bytes.iter().take(missing).enumerate() {
            v.push(byte);
            match from_utf8(&v) {
                Ok(s) => {
                    // We got a valid codepoint: hand it over and tell the reader how many of
                    // the freshly buffered bytes were used.
                    buf.push_str(s);
                    self.consume(i + 1);
                    return Ok(v.len());
                }
                // `error_len()` is `Some` when the sequence is malformed rather than merely
                // incomplete: more bytes cannot fix it.
                Err(err) if err.error_len().is_some() => {
                    definitely_invalid = true;
                    break;
                }
                // Still a valid prefix of a longer codepoint, keep going
                Err(_) => {}
            }
        }

        if !definitely_invalid && v.len() < 4 {
            // The refilled buffer ran out before the codepoint could be completed: the
            // internal buffer is too small to hold the bytes we need.
            return Err(Error::new(
                ErrorKind::InvalidInput,
                format!(
                    "Internal buffer capacity of at least 2 bytes expected to be \
                     able to read utf-8, but it is: {}",
                    available
                ),
            ));
        }
        // We couldn't get a valid codepoint, return the original Utf8Error
        Err(Error::new(ErrorKind::InvalidData, utf8_error))
    }
}
/// Blanket implementation: every [`io::BufRead`] type gets the default methods of this
/// crate's `BufRead` trait.
///
/// [`io::BufRead`]: std::io::BufRead
impl<R> BufRead for R where R: io::BufRead {}
#[cfg(test)]
mod tests {
    use crate::BufRead;
    use std::io::{BufReader, ErrorKind};

    /// A single call on a default-sized reader decodes the whole 4-byte codepoint.
    #[test]
    fn readme_simple_example() {
        let mut reader = BufReader::new("💖".as_bytes());
        let mut text = String::new();
        let n_read = reader.read_utf8(&mut text).unwrap();
        assert_eq!(4, n_read);
        assert_eq!("💖", text.as_str());
    }

    /// With a 4-byte buffer, codepoints cut by the end of the buffer are first left out of the
    /// read, then recovered as a whole by the following call.
    #[test]
    fn codepoint_on_buffer_boundary() {
        // 💖 is 4 bytes long, every other character of the input is 1 byte long.
        // Each entry is (bytes returned by the call, text accumulated after the call).
        let steps = [
            // 1 byte, then 3 bytes of an incomplete codepoint are met
            (1, "0"),
            // the codepoint is recovered as a whole
            (4, "0💖"),
            // 1 byte, then 2 bytes of an incomplete codepoint are met
            (1, "0💖0"),
            // the codepoint is recovered as a whole
            (4, "0💖0💖"),
            // 2 bytes, up to the end of the buffer
            (2, "0💖0💖0u"),
            // 1 byte, then 3 bytes of an incomplete codepoint are met
            (1, "0💖0💖0u0"),
            // the codepoint is recovered as a whole
            (4, "0💖0💖0u0💖"),
            // 3 bytes, up to the end of the buffer
            (3, "0💖0💖0u0💖0u0"),
            // the last codepoint fills the buffer exactly
            (4, "0💖0💖0u0💖0u0💖"),
        ];
        let mut reader = BufReader::with_capacity(4, "0💖0💖0u0💖0u0💖".as_bytes());
        let mut text = String::new();
        for &(n_expected, text_expected) in steps.iter() {
            assert_eq!(n_expected, reader.read_utf8(&mut text).unwrap());
            assert_eq!(text_expected, text.as_str());
        }
    }

    /// A 2-byte buffer is enough to rebuild a 4-byte codepoint from two buffer fills.
    #[test]
    fn two_bytes_capacity() {
        let mut reader = BufReader::with_capacity(2, "💖".as_bytes());
        let mut text = String::new();
        let n_read = reader.read_utf8(&mut text).unwrap();
        assert_eq!(4, n_read);
        assert_eq!("💖", text.as_str());
    }

    /// A 1-byte buffer cannot hold enough bytes to decode "€" (3 bytes long).
    #[test]
    fn one_byte_capacity() {
        let mut reader = BufReader::with_capacity(1, "€".as_bytes());
        let mut text = String::new();
        let failure = reader
            .read_utf8(&mut text)
            .expect_err("a 1-byte buffer cannot decode a 3-byte codepoint");
        assert_eq!(ErrorKind::InvalidInput, failure.kind());
    }
}