use alloc::vec::Vec;
use core::str;
use crate::error::{Error, Result};
/// Accumulates a byte stream delivered in chunks and exposes it as `&str`
/// once complete.
///
/// Chunk boundaries may fall in the middle of a multi-byte UTF-8 sequence;
/// such a partial tail is kept apart from the validated data until a later
/// chunk completes it.
pub(crate) struct Utf8StreamBuffer {
/// Bytes accepted so far. `update` only appends data that passed UTF-8
/// validation, and `finalize_str` returns a view of this vector.
buffer: Vec<u8>,
/// Trailing bytes of an incomplete UTF-8 sequence left over from the most
/// recent chunk, waiting for the next chunk to complete them.
carry: Vec<u8>,
/// Byte cap enforced by `update`; exceeding it is reported as an error.
max_bytes: usize,
}
impl Utf8StreamBuffer {
    /// Creates an empty buffer that accepts at most `max_bytes` bytes.
    #[inline]
    pub(crate) fn new(max_bytes: usize) -> Self {
        Self {
            buffer: Vec::new(),
            // A held-back partial UTF-8 sequence is never longer than 3 bytes.
            carry: Vec::with_capacity(4),
            max_bytes,
        }
    }

    /// Replaces the byte cap applied by later [`update`](Self::update) calls.
    ///
    /// Data that is already buffered is not re-checked against the new cap.
    #[inline]
    pub(crate) fn set_max_bytes(&mut self, max_bytes: usize) {
        self.max_bytes = max_bytes;
    }

    /// Returns the number of validated bytes committed so far.
    ///
    /// Bytes of a pending incomplete sequence are not included.
    #[inline]
    pub(crate) fn buffered_bytes(&self) -> usize {
        self.buffer.len()
    }

    /// Discards all committed and pending bytes; the cap is left unchanged.
    #[inline]
    pub(crate) fn reset(&mut self) {
        self.buffer.clear();
        self.carry.clear();
    }

    /// Feeds the next `chunk` of the stream into the buffer.
    ///
    /// The longest valid UTF-8 prefix of "pending bytes + `chunk`" is
    /// committed; an incomplete multi-byte sequence at the very end is held
    /// back and joined with the next chunk.
    ///
    /// # Errors
    ///
    /// Returns [`Error::InvalidInput`] when
    /// * committed bytes + pending bytes + `chunk` would exceed the cap, or
    /// * the data contains a byte sequence that can never become valid UTF-8.
    ///
    /// A failed call leaves the buffer exactly as it was before the call,
    /// including any pending partial sequence.
    pub(crate) fn update(&mut self, chunk: &[u8]) -> Result<()> {
        let committed = self.buffer.len();
        let pending = self.carry.len();

        // Pending bytes are part of the stream too: leaving them out of the
        // check would let the committed data overshoot the cap once they are
        // completed by this chunk.
        let total = committed
            .saturating_add(pending)
            .saturating_add(chunk.len());
        if total > self.max_bytes {
            return Err(Error::InvalidInput("streaming buffer exceeded cap".into()));
        }

        // Stage the new bytes directly behind the committed data so that no
        // temporary vector is needed; only the staged tail is validated
        // because everything before `committed` was validated earlier.
        self.buffer.reserve(pending + chunk.len());
        self.buffer.extend_from_slice(&self.carry);
        self.buffer.extend_from_slice(chunk);

        let staged = str::from_utf8(&self.buffer[committed..]).map(|s| s.len());
        let valid_len = match staged {
            Ok(len) => len,
            // `error_len() == None` means the input merely stopped in the
            // middle of a character; the valid prefix can be kept.
            Err(e) if e.error_len().is_none() => e.valid_up_to(),
            Err(_) => {
                // Roll back the staged bytes; `carry` has not been touched.
                self.buffer.truncate(committed);
                return Err(Error::InvalidInput("invalid UTF-8 in stream".into()));
            }
        };

        // Move the incomplete tail (possibly empty) out of the committed data.
        let boundary = committed + valid_len;
        self.carry.clear();
        self.carry.extend_from_slice(&self.buffer[boundary..]);
        self.buffer.truncate(boundary);
        Ok(())
    }

    /// Returns the complete stream as a string slice borrowed from the buffer.
    ///
    /// # Errors
    ///
    /// Returns [`Error::InvalidInput`] when an incomplete UTF-8 sequence is
    /// still pending, when no bytes were committed at all, or when the
    /// committed bytes unexpectedly fail re-validation.
    pub(crate) fn finalize_str(&self) -> Result<&str> {
        if !self.carry.is_empty() {
            return Err(Error::InvalidInput("trailing incomplete UTF-8".into()));
        }
        if self.buffer.is_empty() {
            return Err(Error::InvalidInput("empty document".into()));
        }
        // Re-validation keeps this function free of `unsafe`; the error arm
        // is not expected to be reachable through `update`.
        str::from_utf8(&self.buffer)
            .map_err(|e| Error::InvalidInput(alloc::format!("internal UTF-8: {e}")))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Cap used by every test that is not about the cap itself.
    const ROOMY: usize = 64;

    /// Reports whether `outcome` is the `InvalidInput` error variant.
    fn rejected<T>(outcome: core::result::Result<T, Error>) -> bool {
        matches!(outcome, Err(Error::InvalidInput(_)))
    }

    /// Finalizing without any input is an error.
    #[test]
    fn empty_finalize_errors() {
        let stream = Utf8StreamBuffer::new(ROOMY);
        assert!(rejected(stream.finalize_str()));
    }

    /// A single ASCII chunk comes back unchanged.
    #[test]
    fn single_chunk_round_trip() {
        let mut stream = Utf8StreamBuffer::new(ROOMY);
        stream.update(b"hello world").unwrap();
        let text = stream.finalize_str().unwrap();
        assert_eq!(text, "hello world");
    }

    /// A two-byte character split across chunks is reassembled.
    #[test]
    fn split_multibyte_assembles() {
        let mut stream = Utf8StreamBuffer::new(ROOMY);
        let pieces: [&[u8]; 3] = [&[0xC3], &[0xA9], b" world"];
        for piece in pieces {
            stream.update(piece).unwrap();
        }
        let text = stream.finalize_str().unwrap();
        assert_eq!(text, "é world");
    }

    /// A continuation byte with no lead byte is rejected immediately.
    #[test]
    fn invalid_utf8_lone_continuation_errors() {
        let mut stream = Utf8StreamBuffer::new(ROOMY);
        let outcome = stream.update(&[0x80]);
        assert!(rejected(outcome));
    }

    /// Filling the buffer to the cap works; one more byte does not.
    #[test]
    fn cap_enforced_on_update() {
        let mut stream = Utf8StreamBuffer::new(8);
        stream.update(b"01234567").unwrap();
        let overflow = stream.update(b"8");
        assert!(rejected(overflow));
    }

    /// A dangling lead byte makes finalization fail.
    #[test]
    fn finalize_rejects_trailing_carry() {
        let mut stream = Utf8StreamBuffer::new(ROOMY);
        stream.update(&[0xC3]).unwrap();
        assert!(rejected(stream.finalize_str()));
    }

    /// After `reset` the buffer behaves like a fresh one.
    #[test]
    fn reset_clears_state() {
        let mut stream = Utf8StreamBuffer::new(ROOMY);
        stream.update(b"hello").unwrap();
        stream.reset();
        assert_eq!(stream.buffered_bytes(), 0);
        assert!(rejected(stream.finalize_str()));
    }

    /// Held-back bytes do not show up in `buffered_bytes`.
    #[test]
    fn buffered_bytes_excludes_carry() {
        let mut stream = Utf8StreamBuffer::new(ROOMY);
        stream.update(b"abc").unwrap();
        assert_eq!(stream.buffered_bytes(), 3);
        stream.update(&[0xC3]).unwrap();
        assert_eq!(stream.buffered_bytes(), 3);
    }
}