/// Incremental, lossy UTF-8 decoder for byte streams that arrive in chunks.
///
/// A multi-byte code point may be split across consecutive chunks. The decoder
/// holds back the incomplete trailing bytes of one [`Utf8Stream::feed`] call
/// and prepends them to the next chunk, so the code point is emitted as soon
/// as it is complete. Byte sequences that can never become valid UTF-8 are
/// replaced with U+FFFD (the replacement character).
#[derive(Debug, Default)]
pub struct Utf8Stream {
    /// Tail of the previously fed data that is a valid but still incomplete
    /// prefix of a UTF-8 sequence. Empty when nothing is carried over.
    pending: Vec<u8>,
}

impl Utf8Stream {
    /// Creates a decoder with no carried-over bytes.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Decodes `chunk`, together with any bytes carried over from earlier
    /// calls, and returns the text that could be decoded so far.
    ///
    /// * Valid UTF-8 is copied to the output unchanged.
    /// * Each invalid sequence is replaced with one U+FFFD and decoding
    ///   resumes right after it.
    /// * If the data ends in the middle of a sequence that could still be
    ///   completed by later bytes, those trailing bytes are not emitted; they
    ///   are stored and prepended to the next chunk (or reported by
    ///   [`Utf8Stream::flush`]).
    pub fn feed(&mut self, chunk: &[u8]) -> String {
        // Only build a joined buffer when there actually is carry-over; in the
        // common case the chunk is decoded in place without copying it.
        let joined: Vec<u8>;
        let bytes: &[u8] = if self.pending.is_empty() {
            chunk
        } else {
            let mut buf = std::mem::take(&mut self.pending);
            buf.extend_from_slice(chunk);
            joined = buf;
            joined.as_slice()
        };

        // Capacity is only a hint: replacement characters may make the output
        // longer than the input.
        let mut out = String::with_capacity(bytes.len());
        let mut rest = bytes;
        loop {
            match std::str::from_utf8(rest) {
                Ok(valid) => {
                    out.push_str(valid);
                    return out;
                }
                Err(err) => {
                    let (valid, after_valid) = rest.split_at(err.valid_up_to());
                    // SAFETY: `Utf8Error::valid_up_to` is the length of the
                    // longest prefix of `rest` that `from_utf8` verified as
                    // valid UTF-8, and `valid` is exactly that prefix.
                    out.push_str(unsafe { std::str::from_utf8_unchecked(valid) });
                    match err.error_len() {
                        // Input ended inside a sequence that may still be
                        // completed: keep the tail for the next call.
                        None => {
                            self.pending = after_valid.to_vec();
                            return out;
                        }
                        // `len` bytes can never be part of valid UTF-8:
                        // substitute them and continue after them.
                        Some(len) => {
                            out.push('\u{FFFD}');
                            rest = &after_valid[len..];
                        }
                    }
                }
            }
        }
    }

    /// Returns the number of bytes currently held back, waiting for the rest
    /// of their UTF-8 sequence.
    #[must_use]
    pub fn pending_len(&self) -> usize {
        self.pending.len()
    }

    /// Ends the stream: discards any held-back bytes and reports them.
    ///
    /// Returns a single U+FFFD if an incomplete sequence was pending (it can
    /// no longer be completed), or an empty string if nothing was pending.
    /// The decoder can be reused afterwards.
    pub fn flush(&mut self) -> String {
        if self.pending.is_empty() {
            return String::new();
        }
        self.pending.clear();
        String::from('\u{FFFD}')
    }
}
#[cfg(test)]
mod tests {
    // Unit tests for `Utf8Stream`: pass-through of valid text, reassembly of
    // code points split across chunks, replacement of invalid bytes, and the
    // carry-over bookkeeping (`pending`, `pending_len`, `flush`).
    use super::*;

    // Pure ASCII is returned unchanged and leaves nothing pending.
    #[test]
    fn ascii_passes_through() {
        let mut decoder = Utf8Stream::new();
        assert_eq!(decoder.feed(b"hello"), "hello");
        assert_eq!(decoder.feed(b" world"), " world");
        assert!(decoder.pending.is_empty());
    }

    // U+1F389 (F0 9F 8E 89) split 2 + 2 across chunks is emitted once complete.
    #[test]
    fn split_4byte_codepoint_round_trips() {
        let mut decoder = Utf8Stream::new();

        let head = decoder.feed(&[0xF0, 0x9F]);
        assert_eq!(head, "");
        assert_eq!(decoder.pending, vec![0xF0, 0x9F]);

        let tail = decoder.feed(&[0x8E, 0x89]);
        assert_eq!(tail, "\u{1F389}");
        assert!(decoder.pending.is_empty());
    }

    // U+2122 (E2 84 A2) split 1 + 2 across chunks is emitted once complete.
    #[test]
    fn split_3byte_codepoint_round_trips() {
        let mut decoder = Utf8Stream::new();
        let head = decoder.feed(&[0xE2]);
        assert_eq!(head, "");
        let tail = decoder.feed(&[0x84, 0xA2]);
        assert_eq!(tail, "\u{2122}");
    }

    // An invalid byte between valid ones becomes U+FFFD; decoding continues.
    #[test]
    fn invalid_byte_emits_replacement_and_recovers() {
        let mut decoder = Utf8Stream::new();
        let decoded = decoder.feed(&[b'a', 0xFF, b'b']);
        assert_eq!(decoded, "a\u{FFFD}b");
        assert!(decoder.pending.is_empty());
    }

    // Repeatedly feeding a lead byte must not make the carry-over grow.
    #[test]
    fn carryover_is_bounded() {
        let mut decoder = Utf8Stream::new();
        for _round in 0..10 {
            let _ = decoder.feed(&[0xF0]);
            assert!(decoder.pending.len() <= 3, "pending grew beyond 3 bytes");
        }
    }

    // A continuation byte with no lead byte is invalid, not incomplete.
    #[test]
    fn lone_continuation_byte_emits_replacement() {
        let mut decoder = Utf8Stream::new();
        let decoded = decoder.feed(&[0x8E]);
        assert_eq!(decoded, "\u{FFFD}");
        assert!(decoder.pending.is_empty());
    }

    // `pending_len` follows the carry-over through a split code point.
    #[test]
    fn pending_len_reports_carryover_size() {
        let mut decoder = Utf8Stream::new();
        assert_eq!(decoder.pending_len(), 0);

        let _ = decoder.feed(b"hello");
        assert_eq!(decoder.pending_len(), 0);

        let _ = decoder.feed(&[0xF0, 0x9F]);
        assert_eq!(decoder.pending_len(), 2);

        let _ = decoder.feed(&[0x8E, 0x89]);
        assert_eq!(decoder.pending_len(), 0);
    }

    // `flush` reports pending bytes as one U+FFFD, then has nothing left.
    #[test]
    fn flush_emits_replacement_for_pending_bytes() {
        let mut decoder = Utf8Stream::new();
        let _ = decoder.feed(&[0xF0, 0x9F]);
        let first_flush = decoder.flush();
        assert_eq!(first_flush, "\u{FFFD}");
        let second_flush = decoder.flush();
        assert_eq!(second_flush, "");
    }
}