Skip to main content

fhp_encoding/
stream.rs

1//! Streaming decoder for chunk-based processing.
2//!
3//! [`DecodingReader`] wraps a [`Read`](std::io::Read) source and decodes
4//! bytes on-the-fly into UTF-8. This is useful for large inputs where
5//! loading the entire document into memory is undesirable.
6
7use std::io::{self, Read};
8
9use encoding_rs::{Decoder, Encoding};
10
11/// A streaming decoder that wraps a byte source and produces UTF-8 output.
12///
13/// Reads chunks from the inner reader, decodes them using the configured
14/// encoding, and writes UTF-8 bytes into the caller's buffer.
15///
16/// # Example
17///
18/// ```
19/// use fhp_encoding::DecodingReader;
20/// use std::io::Read;
21///
22/// let data = b"Hello, world!";
23/// let mut reader = DecodingReader::new(&data[..], encoding_rs::UTF_8);
24/// let mut output = String::new();
25/// reader.read_to_string(&mut output).unwrap();
26/// assert_eq!(output, "Hello, world!");
27/// ```
28pub struct DecodingReader<R> {
29    inner: R,
30    decoder: Decoder,
31    /// Raw bytes read from `inner` but not yet consumed by the decoder.
32    raw_buf: Vec<u8>,
33    /// Number of valid bytes in `raw_buf`.
34    raw_len: usize,
35    /// Decoded UTF-8 bytes ready to be returned to the caller.
36    decoded_buf: Vec<u8>,
37    /// Read cursor into `decoded_buf`.
38    decoded_pos: usize,
39    /// Number of valid decoded bytes in `decoded_buf`.
40    decoded_len: usize,
41    /// Whether the inner reader has reached EOF.
42    eof: bool,
43}
44
/// Number of bytes requested from the inner source per read (8 KiB).
const CHUNK_SIZE: usize = 8 * 1024;
47
48impl<R: Read> DecodingReader<R> {
49    /// Create a new streaming decoder with the given encoding.
50    pub fn new(inner: R, encoding: &'static Encoding) -> Self {
51        Self {
52            inner,
53            decoder: encoding.new_decoder(),
54            raw_buf: vec![0u8; CHUNK_SIZE],
55            raw_len: 0,
56            decoded_buf: vec![0u8; CHUNK_SIZE * 4], // worst case: 4 bytes per input byte
57            decoded_pos: 0,
58            decoded_len: 0,
59            eof: false,
60        }
61    }
62
63    /// Fill the decoded buffer by reading from the inner source and decoding.
64    fn fill_decoded(&mut self) -> io::Result<()> {
65        // If there's still data in the decoded buffer, don't refill.
66        if self.decoded_pos < self.decoded_len {
67            return Ok(());
68        }
69
70        // Reset decoded buffer.
71        self.decoded_pos = 0;
72        self.decoded_len = 0;
73
74        if self.eof && self.raw_len == 0 {
75            return Ok(());
76        }
77
78        // Read more raw bytes if needed.
79        if self.raw_len == 0 && !self.eof {
80            let n = self.inner.read(&mut self.raw_buf)?;
81            if n == 0 {
82                self.eof = true;
83            } else {
84                self.raw_len = n;
85            }
86        }
87
88        // Decode the raw buffer.
89        let (result, read, written, _had_errors) = self.decoder.decode_to_utf8(
90            &self.raw_buf[..self.raw_len],
91            &mut self.decoded_buf,
92            self.eof,
93        );
94
95        // Shift unconsumed raw bytes to the front.
96        if read < self.raw_len {
97            self.raw_buf.copy_within(read..self.raw_len, 0);
98            self.raw_len -= read;
99        } else {
100            self.raw_len = 0;
101        }
102
103        self.decoded_len = written;
104
105        // If output_full, we need the caller to drain before decoding more.
106        if let encoding_rs::CoderResult::OutputFull = result {
107            // That's fine — caller will read, then we'll decode more.
108        }
109
110        Ok(())
111    }
112}
113
114impl<R: Read> Read for DecodingReader<R> {
115    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
116        if self.decoded_pos >= self.decoded_len {
117            self.fill_decoded()?;
118        }
119
120        if self.decoded_pos >= self.decoded_len {
121            return Ok(0); // EOF
122        }
123
124        let available = self.decoded_len - self.decoded_pos;
125        let to_copy = available.min(buf.len());
126        buf[..to_copy]
127            .copy_from_slice(&self.decoded_buf[self.decoded_pos..self.decoded_pos + to_copy]);
128        self.decoded_pos += to_copy;
129        Ok(to_copy)
130    }
131}
132
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `bytes` through a `DecodingReader` configured for `encoding` and
    /// collect everything it yields into a `String`.
    fn decode_all(bytes: &[u8], encoding: &'static encoding_rs::Encoding) -> String {
        let mut text = String::new();
        DecodingReader::new(bytes, encoding)
            .read_to_string(&mut text)
            .unwrap();
        text
    }

    #[test]
    fn stream_utf8() {
        let decoded = decode_all(b"Hello, world!", encoding_rs::UTF_8);
        assert_eq!(decoded, "Hello, world!");
    }

    #[test]
    fn stream_windows_1254_turkish() {
        // ş=0xFE, ğ=0xF0, ı=0xFD in Windows-1254
        let decoded = decode_all(&[0xFE, 0xF0, 0xFD], encoding_rs::WINDOWS_1254);
        assert_eq!(decoded, "\u{015F}\u{011F}\u{0131}"); // ş, ğ, ı
    }

    #[test]
    fn stream_empty() {
        let decoded = decode_all(b"", encoding_rs::UTF_8);
        assert_eq!(decoded, "");
    }

    #[test]
    fn stream_large_input() {
        // 16 KB of input: more than one CHUNK_SIZE, so several chunks are decoded.
        let expected = "abcdefgh".repeat(2000);
        let decoded = decode_all(expected.as_bytes(), encoding_rs::UTF_8);
        assert_eq!(decoded, expected);
    }

    #[test]
    fn stream_small_read_buf() {
        let source = b"Hello, world!";
        let mut reader = DecodingReader::new(&source[..], encoding_rs::UTF_8);
        let mut collected = Vec::new();
        // Pull the output through a 3-byte window to exercise partial drains.
        let mut window = [0u8; 3];
        loop {
            match reader.read(&mut window).unwrap() {
                0 => break,
                n => collected.extend_from_slice(&window[..n]),
            }
        }
        assert_eq!(String::from_utf8(collected).unwrap(), "Hello, world!");
    }
}