1use std::{
2 collections::VecDeque,
3 error::Error,
4 fs::File,
5 io::{self, BufReader, Cursor, Read},
6 marker::PhantomData,
7 ops::{Deref, DerefMut},
8};
9
10use anyhow::anyhow;
11
12use crate::{CharacterError, CharacterIterator, MultiPeek, Peek, INTERRUPTED_MAX};
13
/// Lookahead interface: hands out a shared reference to an item of type `T`.
pub trait Peekable<T> {
    /// Returns a reference to an item, or `None` when the implementor has none to offer.
    ///
    /// Takes `&mut self`, so implementors are free to update internal state (for example a
    /// read-ahead buffer) while answering.
    fn peek(&mut self) -> Option<&T>;
}
17
/// Lookahead interface with an explicit way to rewind the lookahead.
pub trait MultiPeekable<T> {
    /// Returns a reference to an item, or `None` when the implementor has none to offer.
    ///
    /// Takes `&mut self`, so implementors may move an internal cursor on every call.
    fn peek(&mut self) -> Option<&T>;

    /// Rewinds whatever lookahead state `peek` has accumulated.
    fn reset_peek(&mut self);
}
22
23pub trait CharStream {
24 fn read_char(&mut self) -> CharacterStreamResult;
25 fn is_lossy(&self) -> bool;
26}
27
28pub type CharacterStreamResult = Result<char, CharacterError>;
/// Wraps a byte source and decodes it one UTF-8 character at a time.
pub struct CharacterStream<Reader>
where
    Reader: Read,
{
    /// The wrapped byte source.
    pub stream: Reader,
    /// Lossy-mode flag. When `true`, `read_char` yields U+FFFD for bytes it cannot decode
    /// instead of returning an error.
    pub is_lossy: bool,
}
47
/// Classifies `byte` as the first byte of a UTF-8 sequence.
///
/// Returns `Some(n)` where `n` is the number of continuation bytes that must follow
/// (`0` for ASCII, up to `3` for a four-byte sequence), or `None` when `byte` cannot start a
/// sequence at all.
///
/// `None` covers continuation bytes (`0x80..=0xBF`) and the lead values that never occur in
/// well-formed UTF-8: `0xC0`/`0xC1` (only usable for overlong encodings) and `0xF5..=0xFF`
/// (beyond U+10FFFF). Rejecting those here keeps the caller from consuming the bytes that
/// follow such an impossible lead byte.
fn remaining_byte_count(byte: u8) -> Option<usize> {
    match byte {
        0x00..=0x7F => Some(0),
        0xC2..=0xDF => Some(1),
        0xE0..=0xEF => Some(2),
        0xF0..=0xF4 => Some(3),
        _ => None,
    }
}
67
68impl<Reader: Read> CharacterStream<Reader> {
69 pub fn new(stream: Reader, is_lossy: bool) -> Self {
73 Self { stream, is_lossy }
74 }
75
76 pub fn lossy(mut self, is_lossy: bool) -> Self {
78 self.is_lossy = is_lossy;
79 self
80 }
81
82 pub fn peeky(self) -> PeekableCharacterStream<Reader, Peek> {
84 self.into()
85 }
86
87 pub fn peeky_multi(self) -> PeekableCharacterStream<Reader, MultiPeek> {
89 self.into()
90 }
91
92 pub fn read_bytes(&mut self, amount: usize) -> Result<Vec<u8>, CharacterError> {
100 let handle = (&mut self.stream).take(amount as u64);
101 let result: Vec<Result<u8, io::Error>> = handle.bytes().collect();
102 let bytes: Vec<u8> = result
103 .iter()
104 .filter_map(|r| match r {
105 Ok(b) => Some(*b),
106 _ => None,
107 })
108 .collect();
109 let error = result.into_iter().find_map(|r| match r {
110 Err(error) => Some(error),
111 _ => None,
112 });
113
114 match error {
115 Some(error) => Err(CharacterError::IoError { bytes, error }),
116 None => {
117 let len = bytes.len();
118 if len == 0 {
119 Err(CharacterError::NoBytesRead)
120 } else if len != amount {
121 Err(CharacterError::Other {
122 bytes,
123 error: anyhow!("Failed to read the specified amount of bytes."),
124 })
125 } else {
126 Ok(bytes)
127 }
128 }
129 }
130 }
131
132 pub fn read_byte(&mut self) -> Result<u8, CharacterError> {
134 Ok(self.read_bytes(1)?[0])
135 }
136}
137
138impl<Reader: Read> CharStream for CharacterStream<Reader> {
139 fn read_char(&mut self) -> CharacterStreamResult {
145 match self.read_byte() {
146 Ok(read_byte) => match remaining_byte_count(read_byte) {
147 Some(remaining_count) => {
148 let mut bytes = vec![read_byte];
149 if remaining_count > 0 {
150 bytes.extend(self.read_bytes(remaining_count)?);
151 }
152 let chars: Vec<char> = match simdutf8::basic::from_utf8(&bytes) {
153 Ok(string) => string.chars().collect(),
154 Err(_) if self.is_lossy => vec!['\u{FFFD}'],
155 Err(error) => {
156 return Err(CharacterError::Other {
157 bytes,
158 error: anyhow!(error),
159 })
160 }
161 };
162
163 let len = chars.len();
164
165 if len == 1 {
166 Ok(chars[0])
167 } else {
168 Err(CharacterError::Other {
169 bytes,
170 error: anyhow!(format!("Expected 1 character, not {}", len)),
171 })
172 }
173 }
174 None => {
175 if self.is_lossy {
176 Ok('\u{FFFD}')
177 } else {
178 Err(CharacterError::Other {
179 bytes: vec![read_byte],
180 error: anyhow!("Invalid starting byte"),
181 })
182 }
183 }
184 },
185 Err(error) => return Err(error),
186 }
187 }
188
189 fn is_lossy(&self) -> bool {
190 self.is_lossy
191 }
192}
193
194impl<Reader: std::fmt::Debug + Read> std::fmt::Debug for CharacterStream<Reader> {
195 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
196 write!(f, "{:?}", self)
197 }
198}
199
200impl<Reader: Read> Deref for CharacterStream<Reader> {
201 type Target = Reader;
202
203 fn deref(&self) -> &Self::Target {
204 &self.stream
205 }
206}
207
208impl<Reader: Read> DerefMut for CharacterStream<Reader> {
209 fn deref_mut(&mut self) -> &mut Self::Target {
210 &mut self.stream
211 }
212}
213
214impl<Reader: Read> AsRef<Reader> for CharacterStream<Reader> {
215 fn as_ref(&self) -> &Reader {
216 &*self
217 }
218}
219
220impl<Reader: Read> AsMut<Reader> for CharacterStream<Reader> {
221 fn as_mut(&mut self) -> &mut Reader {
222 &mut *self
223 }
224}
225
226impl<Reader: Read> From<Reader> for CharacterStream<Reader> {
227 fn from(reader: Reader) -> Self {
228 Self::new(reader, false)
229 }
230}
231
232pub struct PeekableCharacterStream<Reader: Read, PI> {
233 pub stream: CharacterStream<Reader>,
234 pub buffer: VecDeque<CharacterStreamResult>,
235 pub position: usize,
236 _phantom: PhantomData<PI>,
237}
238
239impl<Reader: Read, PI> PeekableCharacterStream<Reader, PI> {
240 pub fn new(stream: Reader, is_lossy: bool) -> Self {
241 Self {
242 stream: CharacterStream::new(stream, is_lossy),
243 buffer: VecDeque::new(),
244 position: 0,
245 _phantom: PhantomData,
246 }
247 }
248
249 pub fn from_stream(stream: CharacterStream<Reader>) -> Self {
250 Self {
251 stream,
252 buffer: VecDeque::new(),
253 position: 0,
254 _phantom: PhantomData,
255 }
256 }
257
258 #[inline]
259 fn _read_char(&mut self) -> CharacterStreamResult {
260 self.buffer
261 .pop_front()
262 .unwrap_or_else(|| self.stream.read_char())
263 }
264}
265
266impl<Reader: Read, PI> From<CharacterStream<Reader>> for PeekableCharacterStream<Reader, PI> {
267 fn from(stream: CharacterStream<Reader>) -> Self {
268 Self::from_stream(stream)
269 }
270}
271
272impl<Reader: Read> Peekable<CharacterStreamResult> for PeekableCharacterStream<Reader, Peek> {
273 fn peek(&mut self) -> Option<&CharacterStreamResult> {
274 if self.buffer.len() == 1 {
275 return self.buffer.front();
276 }
277
278 let character_result = self.read_char();
279 self.buffer.push_back(character_result);
280
281 self.buffer.front()
282 }
283}
284
285impl<Reader: Read> MultiPeekable<CharacterStreamResult>
286 for PeekableCharacterStream<Reader, MultiPeek>
287{
288 fn peek(&mut self) -> Option<&CharacterStreamResult> {
289 let ret = if self.position < self.buffer.len() {
290 Some(&self.buffer[self.position])
291 } else {
292 match self.stream.read_char() {
293 Err(CharacterError::NoBytesRead) => None,
294 o => {
295 self.buffer.push_back(o);
296 Some(&self.buffer[self.position])
297 }
298 }
299 };
300
301 self.position += 1;
302 ret
303 }
304
305 fn reset_peek(&mut self) {
306 self.position = 0;
307 }
308}
309
310impl<Reader: Read> CharStream for PeekableCharacterStream<Reader, Peek> {
311 fn read_char(&mut self) -> CharacterStreamResult {
312 self._read_char()
313 }
314
315 fn is_lossy(&self) -> bool {
316 self.stream.is_lossy
317 }
318}
319
320impl<Reader: Read> CharStream for PeekableCharacterStream<Reader, MultiPeek> {
321 fn read_char(&mut self) -> CharacterStreamResult {
322 self.reset_peek();
323 self._read_char()
324 }
325
326 fn is_lossy(&self) -> bool {
327 self.stream.is_lossy
328 }
329}
330
331pub trait ToCharacterStream<Reader: Read> {
333 fn to_character_stream(&self) -> CharacterStream<Reader>;
335
336 fn to_character_stream_lossy(&self) -> CharacterStream<Reader>;
338}
339
340impl<T: AsRef<[u8]>> ToCharacterStream<Cursor<Vec<u8>>> for T {
341 fn to_character_stream(&self) -> CharacterStream<Cursor<Vec<u8>>> {
342 CharacterStream::from(Cursor::new(self.as_ref().to_vec()))
343 }
344
345 fn to_character_stream_lossy(&self) -> CharacterStream<Cursor<Vec<u8>>> {
346 CharacterStream::new(Cursor::new(self.as_ref().to_vec()), true)
347 }
348}
349
350pub trait TryToCharacterStream<Reader: Read> {
352 fn try_to_character_stream(&self) -> Result<CharacterStream<Reader>, Box<dyn Error>>;
354
355 fn try_to_character_stream_lossy(&self) -> Result<CharacterStream<Reader>, Box<dyn Error>>;
357}
358
359impl TryToCharacterStream<BufReader<File>> for File {
360 fn try_to_character_stream(&self) -> Result<CharacterStream<BufReader<File>>, Box<dyn Error>> {
361 let file = self.try_clone()?;
362 Ok(CharacterStream::from(BufReader::new(file)))
363 }
364
365 fn try_to_character_stream_lossy(
366 &self,
367 ) -> Result<CharacterStream<BufReader<File>>, Box<dyn Error>> {
368 let file = self.try_clone()?;
369 Ok(CharacterStream::new(BufReader::new(file), true))
370 }
371}
372
373impl<Reader: Read> IntoIterator for CharacterStream<Reader> {
374 type Item = <Self::IntoIter as Iterator>::Item;
375
376 type IntoIter = CharacterIterator<Self>;
377
378 fn into_iter(self) -> Self::IntoIter {
379 CharacterIterator::new(self, INTERRUPTED_MAX)
380 }
381}
382
383impl<Reader: Read> IntoIterator for PeekableCharacterStream<Reader, Peek> {
384 type Item = <Self::IntoIter as Iterator>::Item;
385
386 type IntoIter = CharacterIterator<Self>;
387
388 fn into_iter(self) -> Self::IntoIter {
389 CharacterIterator::new(self, INTERRUPTED_MAX)
390 }
391}
392
393impl<Reader: Read> IntoIterator for PeekableCharacterStream<Reader, MultiPeek> {
394 type Item = <Self::IntoIter as Iterator>::Item;
395
396 type IntoIter = CharacterIterator<Self>;
397
398 fn into_iter(self) -> Self::IntoIter {
399 CharacterIterator::new(self, INTERRUPTED_MAX)
400 }
401}
402
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes a byte string that ends in two invalid bytes through a lossy multi-peek
    /// stream and checks the decoded text, peeking after every character along the way.
    #[test]
    fn lossy_test() {
        let mut character_stream =
            b"These are valid characters \xF0\x9F\x92\xBB \xF0\x9F\x92\xBB \xF0\x9F\x92\xBB! The following bytes are not valid:\x80\xFF"
                .to_character_stream_lossy()
                .peeky_multi();

        let mut decoded = String::new();

        loop {
            match character_stream.read_char() {
                Ok(c) => {
                    println!("{:X?}; Next: {:?}", c, character_stream.peek());
                    decoded.push(c);
                }
                Err(error) => match &error {
                    CharacterError::IoError {
                        bytes: _,
                        error: err,
                    } => {
                        let kind = err.kind();
                        if kind == std::io::ErrorKind::UnexpectedEof {
                            break;
                        } else {
                            panic!("{}", error)
                        }
                    }
                    CharacterError::NoBytesRead => break,
                    error => panic!("{}", error),
                },
            }
        }

        println!();

        // Valid text must come through unchanged and each invalid byte must become U+FFFD;
        // peeking between reads must not drop or duplicate characters.
        assert_eq!(
            decoded,
            "These are valid characters \u{1F4BB} \u{1F4BB} \u{1F4BB}! The following bytes are not valid:\u{FFFD}\u{FFFD}"
        );
    }
}