utf8_read/
reader.rs

1//a Imports
2use crate::{Char, Error, Result, StreamPosition};
3
4//a Constants
/// [BUFFER_SIZE] is the size, in bytes, of the internal buffer that the
/// UTF-8 character reader fills from the incoming stream.  The larger
/// the value, the larger the read requests that can be issued to the
/// stream.  This value must be larger than `BUFFER_SLACK`.
///
/// For testing purposes this value should be small (such as 8), to
/// catch corner cases in the code where UTF-8 encodings run over the
/// end of a buffer; for performance, this value should be larger
/// (e.g. 2048).
const BUFFER_SIZE  : usize = 2048;

/// [BUFFER_SLACK] must be at least 4 - the maximum number of bytes in
/// a UTF-8 encoding.  When more input is fetched and the read offset
/// lies within the last `BUFFER_SLACK` bytes of the buffer, the
/// unconsumed bytes are first moved to the start of the buffer so
/// that the read has room to complete a partial UTF-8 encoding.
///
/// There is no reason why `BUFFER_SLACK` should be larger than 4.
const BUFFER_SLACK : usize = 4;
18
19//a Reader
20//tp Reader
/// The [Reader] provides a stream of characters by UTF-8 decoding a byte
/// stream provided by any type that implements the [std::io::Read] stream trait.
///
/// It utilizes an internal buffer of bytes that is filled as
/// required from the read stream; it maintains a position within the
/// stream (line and character) for the next character, and provides
/// the ability to get a stream of characters from the stream with any
/// UTF-8 encoding errors reported by line and character.
///
/// The stream can be reclaimed by completing the use of the
/// [Reader], in which case any unused bytes that have been read from
/// the stream are also returned.
///
/// If simple short files are to be read, using
/// [std::fs::read_to_string] may be a better approach than using the
/// `Reader`.
///
/// # Example
///
/// ```
///     use utf8_read::Reader;
///     let str = "This is a \u{1f600} string\nWith a newline\n";
///     let mut buf_bytes = str.as_bytes();
///     let mut reader    = Reader::new(&mut buf_bytes);
///     for x in reader.into_iter() {
///         // x is a Result<char>
///     }
/// ```
///
/// This example could just as easily use `for x in str.chars()`.
///
/// The [Reader], though, can be used over any object supporting the
/// [Read](std::io::Read) trait, such as a
/// [TcpStream](std::net::TcpStream).
///
pub struct Reader<R:std::io::Read> {
    /// The reader from which data is to be fetched
    buf_reader  : R,
    /// `eof_on_no_data` defaults to true; it can be set to false to indicate that
    /// if the stream has no data then the reader should return Char::NoData
    /// when its buffer does not contain a complete UTF-8 character
    eof_on_no_data : bool,
    /// `eof` is set when the stream is complete - any character
    /// requested once `eof` is asserted will be `Char::Eof`.
    eof        : bool,
    /// Internal buffer
    current    : [u8; BUFFER_SIZE],
    /// Offset of the first byte within the internal buffer that is valid
    start      : usize,
    /// Offset of the last byte + 1 within the internal buffer that is valid
    end        : usize,
    /// `valid_end` is the last byte + 1 within the internal buffer
    /// used by a valid UTF-8 byte stream that begins with `start`.
    ///
    /// The intent is that `start` <= `valid_end` <= `end`.  If
    /// `start` < `valid_end` then the bytes in the buffer between the
    /// two are a valid UTF-8 byte stream; this should perhaps be kept
    /// in a string inside the structure for performance.
    valid_end  : usize,
    /// Position in the file of the next character to be returned
    stream_pos : StreamPosition,
}
82
83//ip Reader
84impl <R:std::io::Read> Reader<R> {
85
86    //fp new
87    /// Returns a new UTF-8 character [Reader], with a stream position
88    /// set to the normal start of the file - byte 0, line 1,
89    /// character 1
90    ///
91    /// The [Reader] will default to handling zero bytes returned by
92    /// the stream as an EOF; to modify this default behavior use the
93    /// [set_eof_on_no_data](Reader::set_eof_on_no_data) builder to
94    /// modify the construction.
95    pub fn new(buf_reader: R) -> Self {
96        Self {
97            buf_reader,
98            eof_on_no_data : true,
99            eof            : false,
100            current        : [0; BUFFER_SIZE],
101            start          : 0,
102            end            : 0,
103            valid_end      : 0,
104            stream_pos     : StreamPosition::new(),
105        }
106    }
107
108    //cp set_eof_on_no_data
109    /// Build pattern function to set the `eof_on_no_data` on the [Reader] to true or false
110    ///
111    /// This should not need to be set dynamically; an external source
112    /// can set the eof flag directly if required using the
113    /// [set_eof](Reader::set_eof) method
114    pub fn set_eof_on_no_data(mut self, eof_on_no_data:bool) -> Self {
115        self.eof_on_no_data = eof_on_no_data;
116        self
117    }
118
119    //mp set_position
120    /// Set the current stream position
121    ///
122    /// This may be used if, for example, a stream is being restarted;
123    /// or if a UTF8 encoded stream occurs in the middle of a byte
124    /// file.
125    pub fn set_position(&mut self, stream_pos:StreamPosition) {
126        self.stream_pos = stream_pos;
127    }
128
129    //mp set_eof
130    /// Set the eof indicator as required; when `true` this will halt
131    /// any new data being returned, and the internal buffer points
132    /// will not change when more data is requested of the [Reader].
133    ///
134    /// This method may be invoked on behalf of a stream that has
135    /// completed, but that cannot indicate this by a read operation
136    /// returning zero bytes. For example, it may be used by an
137    /// application which uses a TcpStream for data, and which needs
138    /// to ensure future operations on the [Reader] return no more
139    /// data after the TcpStream has closed.
140    pub fn set_eof(&mut self, eof:bool) {
141        self.eof = eof;
142    }
143
144    //mp eof
145    /// Get the current eof indicator value.
146    ///
147    /// The `EOF` indication is normally set for [Reader]s that have a
148    /// stream that returns no data on a read operation, with that
149    /// behavior modified by the
150    /// [set_eof_on_no_data](Reader::set_eof_on_no_data) method.
151    pub fn eof(&self) -> bool {
152        self.eof
153    }
154
155    //mp complete
156    /// Finish with the stream, returning the buffer handle, the
157    /// position of the *next* character in the stream (if there were
158    /// to be one), and any unused buffer data.
159    pub fn complete(self) -> (R, StreamPosition, Vec<u8>) {
160        (self.buf_reader, self.stream_pos, self.current[self.start..self.end].into())
161    }
162
163    //mp drop_buffer
164    /// Drop the unconsumed data, for example after it has been borrowed and used, and before [complete](Reader::complete) is invoked
165    pub fn drop_buffer(&mut self) {
166        self.stream_pos.move_on_bytes(self.end - self.start);
167        self.start = self.end;
168    }
169
170    //mp buffer_is_empty
171    /// Returns true if the internal buffer is empty
172    pub fn buffer_is_empty(&self) -> bool {
173        self.start == self.end
174    }
175
176    //mp borrow_buffer
177    /// Borrow the data held in the [Reader]'s buffer.
178    pub fn borrow_buffer(&self) -> &[u8] {
179        &self.current[self.start..self.end]
180    }
181
182    //mp borrow_pos
183    /// Borrow the stream position of the next character to be returned
184    pub fn borrow_pos(&self) -> &StreamPosition {
185        &self.stream_pos
186    }
187
188    //mp borrow
189    /// Borrow the underlying stream
190    pub fn borrow(&self) -> &R {
191        &self.buf_reader
192    }
193
194    //mp borrow_mut
195    /// Borrow the underlying stream as a mutable reference
196    pub fn borrow_mut(&mut self) -> &mut R {
197        &mut self.buf_reader
198    }
199
200    //fi fetch_input
201    /// Fetch input from the underlying stream into the internal buffer,
202    /// moving valid data to the start of the buffer first if
203    /// required.  This method should only be invoked if more data is
204    /// required; it is relatively code-heavy.
205    fn fetch_input(&mut self) -> Result<usize> {
206        if self.start>BUFFER_SIZE-BUFFER_SLACK {
207            // Move everything down by self.start
208            let n = self.end - self.start;
209            if n>0 {
210                for i in 0..n {
211                    self.current[i] = self.current[self.start+i];
212                }
213            }
214            self.valid_end -= self.start;
215            self.start      = 0; // == self.start - self.start
216            self.end        = n; // == self.end   - self.start
217        }
218        let n = self.buf_reader.read( &mut self.current[self.end..BUFFER_SIZE] )?;
219        self.end += n;
220        if n==0 && self.eof_on_no_data {
221            self.eof = true;
222        }
223        Ok(n)
224    }
225
226    //mp next_char
227    /// Return the next character from the stream, if one is available, or [EOF](Char::Eof).
228    ///
229    /// If there is no data - or not enough data - from the underlying stream, and the [Reader] is operating with the underlying stream *not* indicating EOF with a zero-byte read result, then [NoData](Char::NoData) is returned.
230    ///
231    /// # Errors
232    ///
233    /// May return [Error::MalformedUtf8] if the next bytes in the stream do not make a well-formed UTF8 character.
234    ///
235    /// May return [Error::IoError] if the underlying stream has an IO Error.
236    pub fn next_char(&mut self) -> Result<Char> {
237        if self.eof {
238            Ok(Char::Eof)
239        } else if self.start == self.end { // no data present, try reading data
240            if self.fetch_input()? == 0 {
241                Ok(Char::NoData)
242            } else {
243                self.next_char()
244            }
245        } else if self.start < self.valid_end { // there is valid UTF-8 data at buffer+self.start
246            let s = {
247                // std::str::from_utf8(&self.current[self.start..self.valid_end]).unwrap()
248                unsafe {
249                    std::str::from_utf8_unchecked(&self.current[self.start..self.valid_end])
250                }
251            };
252            let ch = s.chars().next().unwrap();
253            let n = ch.len_utf8();
254            self.start += n;
255            self.stream_pos.move_by(n, ch);
256            Ok(Char::Char(ch))
257        } else { // there is data but it may or may not be valid
258            match std::str::from_utf8(&self.current[self.start..self.end]) {
259                Ok(_) => { // the data is valid, mark it and the return from there
260                    self.valid_end = self.end;
261                    self.next_char()
262                }
263                Err(e) => { // the data is not all valid
264                    if e.valid_up_to()>0 { // some bytes form valid UTF-8 - mark them and return that data
265                        self.valid_end = self.start+e.valid_up_to();
266                        self.next_char()
267                    } else { // no valid data - check it is just incomplete, or an actual error
268                        match e.error_len() {
269                            None => { // incomplete UTF-8 fetch more
270                                match self.fetch_input()? {
271                                    0 => { // ... and eof reached when incomplete UTF8 is present
272                                        if self.eof {
273                                            Error::malformed_utf8(self.stream_pos, self.end-self.start)
274                                        } else {
275                                            Ok(Char::NoData)
276                                        }
277                                    }
278                                    _ => { // ... but got more data so try that!
279                                        self.next_char()
280                                    }
281                                }
282                            }
283                            Some(n) => { // Bad UTF-8 with n bytes used
284                                let r = Error::malformed_utf8(self.stream_pos, n);
285                                self.stream_pos.move_on_bytes(n);
286                                self.start += n;
287                                r
288                            },
289                        }
290                    }
291                },
292            }
293        }
294    }
295
296    //zz All done
297}
298
299
300//ip Iterator for Reader - iterate over characters
301//
302// allow missing doc code examples for this as it *has* an example but
303// rustdoc does not pick it up.
304#[allow(missing_doc_code_examples)]
305impl <'a, R:std::io::Read> Iterator for &'a mut Reader<R> {
306    // we will be counting with usize
307    type Item = Result<char>;
308
309    //mp next - return next character or None if end of file
310    fn next(&mut self) -> Option<Self::Item> {
311        match self.next_char() {
312            Ok(Char::Char(ch)) => Some(Ok(ch)),
313            Ok(_)              => None,
314            Err(x)             => Some(Err(x)),
315        }
316    }
317
318    //zz All done
319}