use std::str;
/// Incremental UTF-8 decoder for text that arrives as arbitrary byte chunks.
///
/// A multi-byte character may be split across two or more chunks. The decoder
/// holds back the bytes of a character that has not been fully received and
/// completes it once the following chunk is supplied.
#[derive(Debug, Default)]
pub struct Utf8StreamDecoder {
    /// Leading bytes of a character whose encoding is still incomplete.
    buffer: Vec<u8>,
}

impl Utf8StreamDecoder {
    /// Character emitted in place of every malformed byte sequence.
    const REPLACEMENT: char = '\u{FFFD}';

    /// Creates a decoder with nothing buffered.
    pub fn new() -> Self {
        Self::default()
    }

    /// Appends `chunk` to the pending bytes and returns all text that can be
    /// decoded so far.
    ///
    /// * Well-formed text is returned unchanged.
    /// * Each malformed byte sequence is replaced by U+FFFD, so one bad byte
    ///   never discards the valid text around it or in later chunks.
    /// * A character that is cut off at the end of the data is kept in the
    ///   internal buffer (at most three bytes) until more input arrives.
    ///
    /// An empty `chunk` returns an empty string and leaves the buffer as is.
    pub fn decode(&mut self, chunk: &[u8]) -> String {
        if chunk.is_empty() {
            return String::new();
        }
        self.buffer.extend_from_slice(chunk);

        let mut decoded = String::with_capacity(self.buffer.len());
        let mut pending: &[u8] = &self.buffer;
        loop {
            match str::from_utf8(pending) {
                Ok(text) => {
                    decoded.push_str(text);
                    pending = &[];
                    break;
                }
                Err(error) => {
                    let (valid, rest) = pending.split_at(error.valid_up_to());
                    // `valid` is known to be well-formed, so the lossy
                    // conversion borrows it as-is and can never panic.
                    decoded.push_str(&String::from_utf8_lossy(valid));
                    match error.error_len() {
                        // Malformed sequence: substitute it and keep going.
                        Some(bad_len) => {
                            decoded.push(Self::REPLACEMENT);
                            pending = &rest[bad_len..];
                        }
                        // Input ended in the middle of a character: hold the
                        // partial bytes back for the next call.
                        None => {
                            pending = rest;
                            break;
                        }
                    }
                }
            }
        }

        // Everything before `pending` has been emitted; drop it in place.
        let consumed = self.buffer.len() - pending.len();
        self.buffer.drain(..consumed);
        decoded
    }

    /// Ends the stream: returns the buffered bytes as text if they are valid
    /// UTF-8 and empties the buffer.
    ///
    /// Bytes that do not form valid UTF-8 (such as the truncated tail of a
    /// character) are dropped and an empty string is returned; no replacement
    /// character is produced for them.
    pub fn flush(&mut self) -> String {
        let text = str::from_utf8(&self.buffer)
            .map(String::from)
            .unwrap_or_default();
        self.buffer.clear();
        text
    }

    /// Discards any buffered bytes without decoding them.
    pub fn reset(&mut self) {
        self.buffer.clear();
    }

    /// Returns `true` while bytes of an unfinished character are held back.
    pub fn has_buffered_bytes(&self) -> bool {
        !self.buffer.is_empty()
    }

    /// Number of bytes currently held back.
    pub fn buffered_byte_count(&self) -> usize {
        self.buffer.len()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Feeds `bytes` to a fresh decoder in pieces of `piece_len` bytes and
    /// returns everything it produced, including the final flush.
    fn decode_in_pieces(bytes: &[u8], piece_len: usize) -> String {
        let mut stream = Utf8StreamDecoder::new();
        let mut collected: String = bytes
            .chunks(piece_len)
            .map(|piece| stream.decode(piece))
            .collect();
        collected.push_str(&stream.flush());
        collected
    }

    // Plain ASCII passes straight through and never leaves bytes behind.
    #[test]
    fn test_ascii_characters() {
        let mut stream = Utf8StreamDecoder::new();
        assert_eq!(stream.decode(b"Hello"), "Hello");
        assert_eq!(stream.decode(b" World"), " World");
        assert!(!stream.has_buffered_bytes());
    }

    // Whole multi-byte characters delivered in one chunk decode immediately.
    #[test]
    fn test_complete_utf8_characters() {
        let mut stream = Utf8StreamDecoder::new();
        let decoded = stream.decode("你好".as_bytes());
        assert_eq!(decoded, "你好");
        assert!(!stream.has_buffered_bytes());
    }

    // A three-byte character split 2 + 1 is held back, then completed.
    #[test]
    fn test_incomplete_utf8_sequences() {
        let mut stream = Utf8StreamDecoder::new();
        let head: [u8; 2] = [0xE4, 0xB8];
        let tail: [u8; 1] = [0xAD];

        let first = stream.decode(&head);
        assert_eq!(first, "");
        assert!(stream.has_buffered_bytes());
        assert_eq!(stream.buffered_byte_count(), 2);

        let second = stream.decode(&tail);
        assert_eq!(second, "中");
        assert!(!stream.has_buffered_bytes());
    }

    // ASCII, CJK and emoji mixed together, cut every three bytes.
    #[test]
    fn test_mixed_content() {
        let expected = "Hello 你好 World 🌍";
        assert_eq!(decode_in_pieces(expected.as_bytes(), 3), expected);
    }

    // Three- and four-byte characters cut every two bytes.
    #[test]
    fn test_emoji_sequences() {
        let expected = "🌍🚀✨";
        assert_eq!(decode_in_pieces(expected.as_bytes(), 2), expected);
    }

    // Flushing an unfinished character yields nothing and empties the buffer.
    #[test]
    fn test_flush() {
        let mut stream = Utf8StreamDecoder::new();
        let partial: [u8; 2] = [0xE4, 0xB8];
        assert_eq!(stream.decode(&partial), "");
        assert!(stream.has_buffered_bytes());

        assert_eq!(stream.flush(), "");
        assert!(!stream.has_buffered_bytes());
    }

    // Reset throws away whatever was being held back.
    #[test]
    fn test_reset() {
        let mut stream = Utf8StreamDecoder::new();
        stream.decode(b"Hello");
        stream.decode(&[0xE4, 0xB8]);
        assert!(stream.has_buffered_bytes());

        stream.reset();
        assert!(!stream.has_buffered_bytes());
        assert_eq!(stream.buffered_byte_count(), 0);
    }

    // Empty input produces empty output from both decode and flush.
    #[test]
    fn test_empty_input() {
        let mut stream = Utf8StreamDecoder::new();
        assert_eq!(stream.decode(&[]), "");
        assert_eq!(stream.flush(), "");
    }
}