use std::io::{self, BufReader, Read};
use utf8_zero::BufReadDecoder;
/// Buffer capacity, in bytes, that `Utf8ChunkReader::new` passes to
/// `Utf8ChunkReader::with_capacity` (8 KiB).
const DEFAULT_CAPACITY: usize = 8 * 1024;
/// Reads from a boxed `Read + Send` source and hands the data out as owned
/// UTF-8 `String` chunks via `next_chunk`.
///
/// The source is wrapped in a `BufReader`, which in turn feeds a
/// `BufReadDecoder`; `next_chunk` pulls one decoded piece at a time from it.
pub(crate) struct Utf8ChunkReader {
/// Decoder layered over the buffered source; `next_chunk` drives it through
/// its `next_strict` method.
decoder: BufReadDecoder<BufReader<Box<dyn Read + Send>>>,
}
impl Utf8ChunkReader {
pub(crate) fn new(reader: Box<dyn Read + Send>) -> Self {
Self::with_capacity(DEFAULT_CAPACITY, reader)
}
pub(crate) fn with_capacity(capacity: usize, reader: Box<dyn Read + Send>) -> Self {
let buf_reader = BufReader::with_capacity(capacity, reader);
Self {
decoder: BufReadDecoder::new(buf_reader),
}
}
pub(crate) fn next_chunk(&mut self) -> io::Result<Option<String>> {
match self.decoder.next_strict() {
Some(Ok(chunk)) => Ok(Some(chunk.to_string())),
Some(Err(e)) => Err(io::Error::new(io::ErrorKind::InvalidData, e.to_string())),
None => Ok(None),
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use assertables::*;

    /// Copies the bytes of `s` into an in-memory cursor and boxes it as a reader.
    fn reader_from(s: &str) -> Box<dyn Read + Send> {
        let bytes = Vec::from(s.as_bytes());
        Box::new(io::Cursor::new(bytes))
    }

    /// Pulls chunks from `reader` until it reports `None`, panicking on any error.
    fn drain(mut reader: Utf8ChunkReader) -> Vec<String> {
        let mut chunks = Vec::new();
        while let Some(chunk) = reader.next_chunk().unwrap() {
            chunks.push(chunk);
        }
        chunks
    }

    /// An empty source yields no chunk at all.
    #[test]
    fn empty_input_returns_none() {
        let mut reader = Utf8ChunkReader::new(reader_from(""));
        let first = reader.next_chunk().unwrap();
        assert_none!(&first);
    }

    /// Input smaller than the buffer arrives as a single chunk, then the end.
    #[test]
    fn small_input_returns_one_chunk() {
        let mut reader = Utf8ChunkReader::new(reader_from("hello world"));

        let first = reader.next_chunk().unwrap();
        assert_eq!(first.as_deref(), Some("hello world"));

        let second = reader.next_chunk().unwrap();
        assert_none!(&second);
    }

    /// Input larger than the default buffer is split, and the pieces join back losslessly.
    #[test]
    fn large_input_returns_multiple_chunks() {
        let input: String = "abcdefgh".repeat(2000);
        let chunks = drain(Utf8ChunkReader::new(reader_from(&input)));

        assert_eq!(chunks.concat(), input);
        assert_gt!(chunks.len(), 1);
    }

    /// Two-byte characters spanning buffer refills still round-trip intact.
    #[test]
    fn multibyte_chars_not_split() {
        let input = "é".repeat(5000);
        let chunks = drain(Utf8ChunkReader::new(reader_from(&input)));

        assert_eq!(chunks.concat(), input);
    }

    /// A 4-byte buffer produces many chunks, none longer than the buffer.
    #[test]
    fn tiny_capacity_forces_many_chunks() {
        let input = "hello world, this is a test";
        let chunks = drain(Utf8ChunkReader::with_capacity(4, reader_from(input)));

        for chunk in &chunks {
            assert_le!(chunk.len(), 4);
        }
        assert_eq!(chunks.concat(), input);
        assert_gt!(chunks.len(), 5);
    }

    /// Four-byte characters read through a 4-byte buffer round-trip intact.
    #[test]
    fn all_multibyte_with_small_capacity() {
        let input = "😀😀😀😀😀";
        let chunks = drain(Utf8ChunkReader::with_capacity(4, reader_from(input)));

        assert_eq!(chunks.concat(), input);
    }
}