character_stream/
character_stream.rs

1use std::{
2    collections::VecDeque,
3    error::Error,
4    fs::File,
5    io::{self, BufReader, Cursor, Read},
6    marker::PhantomData,
7    ops::{Deref, DerefMut},
8};
9
10use anyhow::anyhow;
11
12use crate::{CharacterError, CharacterIterator, MultiPeek, Peek, INTERRUPTED_MAX};
13
/// Lookahead access to the next item of a stream.
///
/// In this file it is implemented by [PeekableCharacterStream] in its
/// single-peek ([Peek]) flavour.
pub trait Peekable<T> {
    /// Returns a reference to the upcoming item, if one is available.
    fn peek(&mut self) -> Option<&T>;
}
17
/// Lookahead access that can look at several upcoming items in a row.
///
/// In this file it is implemented by [PeekableCharacterStream] in its
/// multi-peek ([MultiPeek]) flavour.
pub trait MultiPeekable<T> {
    /// Returns a reference to the item at the current peek cursor, if any.
    fn peek(&mut self) -> Option<&T>;

    /// Moves the peek cursor back to the start.
    fn reset_peek(&mut self);
}
22
23pub trait CharStream {
24    fn read_char(&mut self) -> CharacterStreamResult;
25    fn is_lossy(&self) -> bool;
26}
27
28/// A result that contains a parsed character or a [CharacterStreamError].
29pub type CharacterStreamResult = Result<char, CharacterError>;
/// Wrapper around any stream that implements [Read](std::io::Read).
///
/// Bytes are pulled from the wrapped stream and parsed into characters.
///
/// The bytes are expected to be valid UTF-8; how invalid input is treated
/// depends on [is_lossy](CharacterStream::is_lossy).
///
/// Only individual characters are parsed; graphemes are NOT assembled.
pub struct CharacterStream<Reader: Read> {
    /// The underlying stream the bytes are read from.
    pub stream: Reader,
    /// Controls what happens when an invalid byte sequence is detected.
    ///
    /// When `true`, the invalid sequence is replaced with U+FFFD.
    ///
    /// When `false`, an error is returned instead.
    pub is_lossy: bool,
}
47
/// Classifies the first byte of a UTF-8 sequence.
///
/// Returns `Some(n)` where `n` is the number of continuation bytes that must
/// follow `byte`, or `None` when `byte` can never start a well-formed sequence.
///
/// Bytes rejected with `None`:
/// * `0x80..=0xBF` — continuation bytes,
/// * `0xC0`, `0xC1` — could only start overlong two-byte encodings,
/// * `0xF5..=0xFF` — would encode code points above U+10FFFF (or are not
///   UTF-8 at all).
///
/// Rejecting these up front means the caller does not consume the bytes that
/// follow an impossible lead byte.
fn remaining_byte_count(byte: u8) -> Option<usize> {
    match byte {
        // Single byte character (ASCII)
        0x00..=0x7F => Some(0),
        // Two byte character
        0xC2..=0xDF => Some(1),
        // Three byte character
        0xE0..=0xEF => Some(2),
        // Four byte character
        0xF0..=0xF4 => Some(3),
        _ => None,
    }
}
67
68impl<Reader: Read> CharacterStream<Reader> {
69    /// Create a [CharacterStream] from a stream.
70    ///
71    /// Set `is_lossy` to `true` if you don't want to handle invalid byte sequences.
72    pub fn new(stream: Reader, is_lossy: bool) -> Self {
73        Self { stream, is_lossy }
74    }
75
76    /// Kinda builder pattern.
77    pub fn lossy(mut self, is_lossy: bool) -> Self {
78        self.is_lossy = is_lossy;
79        self
80    }
81
82    /// Wrap `self` into a single-peek [PeekableCharacterStream].
83    pub fn peeky(self) -> PeekableCharacterStream<Reader, Peek> {
84        self.into()
85    }
86
87    /// Wrap `self` into a multi-peek [PeekableCharacterStream].
88    pub fn peeky_multi(self) -> PeekableCharacterStream<Reader, MultiPeek> {
89        self.into()
90    }
91
92    /// Reads a set amount of bytes from the stream.
93    ///
94    /// Set `amount` to the amount of bytes you would like to read.
95    ///
96    /// Upon success, a [`Vec<u8>`] is returned, holding the read bytes.
97    ///
98    /// Upon failure, an [error](CharacterError) is returned.
99    pub fn read_bytes(&mut self, amount: usize) -> Result<Vec<u8>, CharacterError> {
100        let handle = (&mut self.stream).take(amount as u64);
101        let result: Vec<Result<u8, io::Error>> = handle.bytes().collect();
102        let bytes: Vec<u8> = result
103            .iter()
104            .filter_map(|r| match r {
105                Ok(b) => Some(*b),
106                _ => None,
107            })
108            .collect();
109        let error = result.into_iter().find_map(|r| match r {
110            Err(error) => Some(error),
111            _ => None,
112        });
113
114        match error {
115            Some(error) => Err(CharacterError::IoError { bytes, error }),
116            None => {
117                let len = bytes.len();
118                if len == 0 {
119                    Err(CharacterError::NoBytesRead)
120                } else if len != amount {
121                    Err(CharacterError::Other {
122                        bytes,
123                        error: anyhow!("Failed to read the specified amount of bytes."),
124                    })
125                } else {
126                    Ok(bytes)
127                }
128            }
129        }
130    }
131
132    /// Reads a singluar byte from the stream.
133    pub fn read_byte(&mut self) -> Result<u8, CharacterError> {
134        Ok(self.read_bytes(1)?[0])
135    }
136}
137
138impl<Reader: Read> CharStream for CharacterStream<Reader> {
139    /// Attempts to read a character from the stream.
140    ///
141    /// If `is_lossy` is set to `true`, then invalid byte sequences will be a U+FFFD.
142    ///
143    /// If `is_lossy` is set to `false`, then invalid byte sequences will be returned in addition to a parse error.
144    fn read_char(&mut self) -> CharacterStreamResult {
145        match self.read_byte() {
146            Ok(read_byte) => match remaining_byte_count(read_byte) {
147                Some(remaining_count) => {
148                    let mut bytes = vec![read_byte];
149                    if remaining_count > 0 {
150                        bytes.extend(self.read_bytes(remaining_count)?);
151                    }
152                    let chars: Vec<char> = match simdutf8::basic::from_utf8(&bytes) {
153                        Ok(string) => string.chars().collect(),
154                        Err(_) if self.is_lossy => vec!['\u{FFFD}'],
155                        Err(error) => {
156                            return Err(CharacterError::Other {
157                                bytes,
158                                error: anyhow!(error),
159                            })
160                        }
161                    };
162
163                    let len = chars.len();
164
165                    if len == 1 {
166                        Ok(chars[0])
167                    } else {
168                        Err(CharacterError::Other {
169                            bytes,
170                            error: anyhow!(format!("Expected 1 character, not {}", len)),
171                        })
172                    }
173                }
174                None => {
175                    if self.is_lossy {
176                        Ok('\u{FFFD}')
177                    } else {
178                        Err(CharacterError::Other {
179                            bytes: vec![read_byte],
180                            error: anyhow!("Invalid starting byte"),
181                        })
182                    }
183                }
184            },
185            Err(error) => return Err(error),
186        }
187    }
188
189    fn is_lossy(&self) -> bool {
190        self.is_lossy
191    }
192}
193
194impl<Reader: std::fmt::Debug + Read> std::fmt::Debug for CharacterStream<Reader> {
195    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
196        write!(f, "{:?}", self)
197    }
198}
199
200impl<Reader: Read> Deref for CharacterStream<Reader> {
201    type Target = Reader;
202
203    fn deref(&self) -> &Self::Target {
204        &self.stream
205    }
206}
207
208impl<Reader: Read> DerefMut for CharacterStream<Reader> {
209    fn deref_mut(&mut self) -> &mut Self::Target {
210        &mut self.stream
211    }
212}
213
214impl<Reader: Read> AsRef<Reader> for CharacterStream<Reader> {
215    fn as_ref(&self) -> &Reader {
216        &*self
217    }
218}
219
220impl<Reader: Read> AsMut<Reader> for CharacterStream<Reader> {
221    fn as_mut(&mut self) -> &mut Reader {
222        &mut *self
223    }
224}
225
226impl<Reader: Read> From<Reader> for CharacterStream<Reader> {
227    fn from(reader: Reader) -> Self {
228        Self::new(reader, false)
229    }
230}
231
232pub struct PeekableCharacterStream<Reader: Read, PI> {
233    pub stream: CharacterStream<Reader>,
234    pub buffer: VecDeque<CharacterStreamResult>,
235    pub position: usize,
236    _phantom: PhantomData<PI>,
237}
238
239impl<Reader: Read, PI> PeekableCharacterStream<Reader, PI> {
240    pub fn new(stream: Reader, is_lossy: bool) -> Self {
241        Self {
242            stream: CharacterStream::new(stream, is_lossy),
243            buffer: VecDeque::new(),
244            position: 0,
245            _phantom: PhantomData,
246        }
247    }
248
249    pub fn from_stream(stream: CharacterStream<Reader>) -> Self {
250        Self {
251            stream,
252            buffer: VecDeque::new(),
253            position: 0,
254            _phantom: PhantomData,
255        }
256    }
257
258    #[inline]
259    fn _read_char(&mut self) -> CharacterStreamResult {
260        self.buffer
261            .pop_front()
262            .unwrap_or_else(|| self.stream.read_char())
263    }
264}
265
266impl<Reader: Read, PI> From<CharacterStream<Reader>> for PeekableCharacterStream<Reader, PI> {
267    fn from(stream: CharacterStream<Reader>) -> Self {
268        Self::from_stream(stream)
269    }
270}
271
272impl<Reader: Read> Peekable<CharacterStreamResult> for PeekableCharacterStream<Reader, Peek> {
273    fn peek(&mut self) -> Option<&CharacterStreamResult> {
274        if self.buffer.len() == 1 {
275            return self.buffer.front();
276        }
277
278        let character_result = self.read_char();
279        self.buffer.push_back(character_result);
280
281        self.buffer.front()
282    }
283}
284
285impl<Reader: Read> MultiPeekable<CharacterStreamResult>
286    for PeekableCharacterStream<Reader, MultiPeek>
287{
288    fn peek(&mut self) -> Option<&CharacterStreamResult> {
289        let ret = if self.position < self.buffer.len() {
290            Some(&self.buffer[self.position])
291        } else {
292            match self.stream.read_char() {
293                Err(CharacterError::NoBytesRead) => None,
294                o => {
295                    self.buffer.push_back(o);
296                    Some(&self.buffer[self.position])
297                }
298            }
299        };
300
301        self.position += 1;
302        ret
303    }
304
305    fn reset_peek(&mut self) {
306        self.position = 0;
307    }
308}
309
310impl<Reader: Read> CharStream for PeekableCharacterStream<Reader, Peek> {
311    fn read_char(&mut self) -> CharacterStreamResult {
312        self._read_char()
313    }
314
315    fn is_lossy(&self) -> bool {
316        self.stream.is_lossy
317    }
318}
319
320impl<Reader: Read> CharStream for PeekableCharacterStream<Reader, MultiPeek> {
321    fn read_char(&mut self) -> CharacterStreamResult {
322        self.reset_peek();
323        self._read_char()
324    }
325
326    fn is_lossy(&self) -> bool {
327        self.stream.is_lossy
328    }
329}
330
331/// Helper trait for converting values into a [CharacterStream].
332pub trait ToCharacterStream<Reader: Read> {
333    /// Convert into a [CharacterStream].
334    fn to_character_stream(&self) -> CharacterStream<Reader>;
335
336    /// Convert into a lossy [CharacterStream].
337    fn to_character_stream_lossy(&self) -> CharacterStream<Reader>;
338}
339
340impl<T: AsRef<[u8]>> ToCharacterStream<Cursor<Vec<u8>>> for T {
341    fn to_character_stream(&self) -> CharacterStream<Cursor<Vec<u8>>> {
342        CharacterStream::from(Cursor::new(self.as_ref().to_vec()))
343    }
344
345    fn to_character_stream_lossy(&self) -> CharacterStream<Cursor<Vec<u8>>> {
346        CharacterStream::new(Cursor::new(self.as_ref().to_vec()), true)
347    }
348}
349
350/// Helper trait for converting values into a [CharacterStream], with a potential for failure.
351pub trait TryToCharacterStream<Reader: Read> {
352    /// Attempt to convert into a [CharacterStream].
353    fn try_to_character_stream(&self) -> Result<CharacterStream<Reader>, Box<dyn Error>>;
354
355    /// Attempt to convert into a lossy [CharacterStream].
356    fn try_to_character_stream_lossy(&self) -> Result<CharacterStream<Reader>, Box<dyn Error>>;
357}
358
359impl TryToCharacterStream<BufReader<File>> for File {
360    fn try_to_character_stream(&self) -> Result<CharacterStream<BufReader<File>>, Box<dyn Error>> {
361        let file = self.try_clone()?;
362        Ok(CharacterStream::from(BufReader::new(file)))
363    }
364
365    fn try_to_character_stream_lossy(
366        &self,
367    ) -> Result<CharacterStream<BufReader<File>>, Box<dyn Error>> {
368        let file = self.try_clone()?;
369        Ok(CharacterStream::new(BufReader::new(file), true))
370    }
371}
372
373impl<Reader: Read> IntoIterator for CharacterStream<Reader> {
374    type Item = <Self::IntoIter as Iterator>::Item;
375
376    type IntoIter = CharacterIterator<Self>;
377
378    fn into_iter(self) -> Self::IntoIter {
379        CharacterIterator::new(self, INTERRUPTED_MAX)
380    }
381}
382
383impl<Reader: Read> IntoIterator for PeekableCharacterStream<Reader, Peek> {
384    type Item = <Self::IntoIter as Iterator>::Item;
385
386    type IntoIter = CharacterIterator<Self>;
387
388    fn into_iter(self) -> Self::IntoIter {
389        CharacterIterator::new(self, INTERRUPTED_MAX)
390    }
391}
392
393impl<Reader: Read> IntoIterator for PeekableCharacterStream<Reader, MultiPeek> {
394    type Item = <Self::IntoIter as Iterator>::Item;
395
396    type IntoIter = CharacterIterator<Self>;
397
398    fn into_iter(self) -> Self::IntoIter {
399        CharacterIterator::new(self, INTERRUPTED_MAX)
400    }
401}
402
#[cfg(test)]
mod tests {
    use super::*;

    /// Drains a lossy multi-peek stream built from mixed valid and invalid
    /// bytes, printing each character together with one peeked result.
    ///
    /// The loop ends on `NoBytesRead` or on an I/O error of kind
    /// `UnexpectedEof`; any other error fails the test.
    #[test]
    fn lossy_test() {
        let mut character_stream =
            b"These are valid characters \xF0\x9F\x92\xBB \xF0\x9F\x92\xBB \xF0\x9F\x92\xBB! The following bytes are not valid:\x80\xFF"
                .to_character_stream_lossy().peeky_multi();

        loop {
            match character_stream.read_char() {
                Ok(c) => println!("{:X?}; Next: {:?}", c, character_stream.peek()),
                Err(CharacterError::NoBytesRead) => break,
                Err(CharacterError::IoError {
                    error: ref io_error,
                    ..
                }) if io_error.kind() == std::io::ErrorKind::UnexpectedEof => break,
                Err(error) => panic!("{}", error),
            }
        }

        println!();
    }
}