utf8_read/reader.rs
1//a Imports
2use crate::{Char, Error, Result, StreamPosition};
3
4//a Constants
/// [BUFFER_SIZE] is the maximum number of bytes held in the UTF-8
/// character reader from the incoming stream; it is the length of the
/// `current` byte array inside [Reader]. The larger the value,
/// the larger the data read requests from the stream. This value must be larger than `BUFFER_SLACK`.
/// For testing purposes this value should be small (such as 8), to catch corner cases in the code where UTF-8 encodings
/// run over the end of a buffer; for performance, this value should be larger (e.g. 2048).
const BUFFER_SIZE : usize = 2048;

/// [BUFFER_SLACK] must be at least 4 - the maximum number of bytes in
/// a UTF-8 encoding. When more input is fetched and the first unread
/// byte lies beyond `BUFFER_SIZE - BUFFER_SLACK`, the unread bytes are
/// first moved down to the start of the buffer, so that the following
/// read can fill the rest of the `BUFFER_SIZE` buffer with new data.
/// There is no reason why `BUFFER_SLACK` should be larger than 4.
const BUFFER_SLACK : usize = 4;
18
19//a Reader
20//tp Reader
/// The [Reader] provides a stream of characters by UTF-8 decoding a byte
/// stream provided by any type that implements the [std::io::Read] stream trait.
///
/// It utilizes an internal buffer of bytes that are filled as
/// required from the read stream; it maintains a position with the
/// stream (line and character) for the next character, and provides
/// the ability to get a stream of characters from the stream with any
/// UTF-8 encoding errors reported by line and character.
///
/// The stream can be reclaimed by completing the use of the
/// [Reader], in which case any unused bytes that have been read from
/// the stream are also returned.
///
/// If simple short files are to be read, using
/// [std::fs::read_to_string] may be a better approach than using the
/// `Reader`.
///
/// # Example
///
/// ```
///     use utf8_read::Reader;
///     let str = "This is a \u{1f600} string\nWith a newline\n";
///     let mut buf_bytes = str.as_bytes();
///     let mut reader    = Reader::new(&mut buf_bytes);
///     for x in reader.into_iter() {
///         // use char x
///     }
/// ```
///
/// This example could just as easily use 'for x in str.chars()'.
///
/// The [Reader], though, can be used over any object supporting the
/// [Read](std::io::Read) trait such as a
/// [TcpStream](std::net::TcpStream).
///
pub struct Reader<R:std::io::Read> {
    /// The reader from which data is to be fetched
    buf_reader : R,
    /// `eof_on_no_data` defaults to true; it can be set to false to indicate that
    /// if the stream has no data then the reader should return Char::NoData
    /// when its buffer does not contain a complete UTF-8 character
    eof_on_no_data : bool,
    /// `eof` is set when the stream is complete - any character
    /// requested once `eof` is asserted will be `Char::Eof`.
    eof        : bool,
    /// Internal buffer of raw bytes read from the stream; only the
    /// bytes in `start..end` are meaningful
    current    : [u8; BUFFER_SIZE],
    /// Offset of the first byte within the internal buffer that is valid
    start      : usize,
    /// Offset of the last byte + 1 within the internal buffer that is valid
    end        : usize,
    /// `valid_end` is the last byte + 1 within the internal buffer
    /// used by a valid UTF-8 byte stream that begins with `start`.
    /// The intended relationship is `start` <= `valid_end` <= `end`.
    /// If `start` < `valid_end` then the bytes in the buffer between
    /// the two are a valid UTF-8 byte stream; this should perhaps be
    /// kept in a string inside the structure for performance
    valid_end  : usize,
    /// Position in the file of the next character to be returned
    stream_pos : StreamPosition,
}
82
83//ip Reader
84impl <R:std::io::Read> Reader<R> {
85
86 //fp new
87 /// Returns a new UTF-8 character [Reader], with a stream position
88 /// set to the normal start of the file - byte 0, line 1,
89 /// character 1
90 ///
91 /// The [Reader] will default to handling zero bytes returned by
92 /// the stream as an EOF; to modify this default behavior use the
93 /// [set_eof_on_no_data](Reader::set_eof_on_no_data) builder to
94 /// modify the construction.
95 pub fn new(buf_reader: R) -> Self {
96 Self {
97 buf_reader,
98 eof_on_no_data : true,
99 eof : false,
100 current : [0; BUFFER_SIZE],
101 start : 0,
102 end : 0,
103 valid_end : 0,
104 stream_pos : StreamPosition::new(),
105 }
106 }
107
108 //cp set_eof_on_no_data
109 /// Build pattern function to set the `eof_on_no_data` on the [Reader] to true or false
110 ///
111 /// This should not need to be set dynamically; an external source
112 /// can set the eof flag directly if required using the
113 /// [set_eof](Reader::set_eof) method
114 pub fn set_eof_on_no_data(mut self, eof_on_no_data:bool) -> Self {
115 self.eof_on_no_data = eof_on_no_data;
116 self
117 }
118
119 //mp set_position
120 /// Set the current stream position
121 ///
122 /// This may be used if, for example, a stream is being restarted;
123 /// or if a UTF8 encoded stream occurs in the middle of a byte
124 /// file.
125 pub fn set_position(&mut self, stream_pos:StreamPosition) {
126 self.stream_pos = stream_pos;
127 }
128
129 //mp set_eof
130 /// Set the eof indicator as required; when `true` this will halt
131 /// any new data being returned, and the internal buffer points
132 /// will not change when more data is requested of the [Reader].
133 ///
134 /// This method may be invoked on behalf of a stream that has
135 /// completed, but that cannot indicate this by a read operation
136 /// returning zero bytes. For example, it may be used by an
137 /// application which uses a TcpStream for data, and which needs
138 /// to ensure future operations on the [Reader] return no more
139 /// data after the TcpStream has closed.
140 pub fn set_eof(&mut self, eof:bool) {
141 self.eof = eof;
142 }
143
144 //mp eof
145 /// Get the current eof indicator value.
146 ///
147 /// The `EOF` indication is normally set for [Reader]s that have a
148 /// stream that returns no data on a read operation, with that
149 /// behavior modified by the
150 /// [set_eof_on_no_data](Reader::set_eof_on_no_data) method.
151 pub fn eof(&self) -> bool {
152 self.eof
153 }
154
155 //mp complete
156 /// Finish with the stream, returning the buffer handle, the
157 /// position of the *next* character in the stream (if there were
158 /// to be one), and any unused buffer data.
159 pub fn complete(self) -> (R, StreamPosition, Vec<u8>) {
160 (self.buf_reader, self.stream_pos, self.current[self.start..self.end].into())
161 }
162
163 //mp drop_buffer
164 /// Drop the unconsumed data, for example after it has been borrowed and used, and before [complete](Reader::complete) is invoked
165 pub fn drop_buffer(&mut self) {
166 self.stream_pos.move_on_bytes(self.end - self.start);
167 self.start = self.end;
168 }
169
170 //mp buffer_is_empty
171 /// Returns true if the internal buffer is empty
172 pub fn buffer_is_empty(&self) -> bool {
173 self.start == self.end
174 }
175
176 //mp borrow_buffer
177 /// Borrow the data held in the [Reader]'s buffer.
178 pub fn borrow_buffer(&self) -> &[u8] {
179 &self.current[self.start..self.end]
180 }
181
182 //mp borrow_pos
183 /// Borrow the stream position of the next character to be returned
184 pub fn borrow_pos(&self) -> &StreamPosition {
185 &self.stream_pos
186 }
187
188 //mp borrow
189 /// Borrow the underlying stream
190 pub fn borrow(&self) -> &R {
191 &self.buf_reader
192 }
193
194 //mp borrow_mut
195 /// Borrow the underlying stream as a mutable reference
196 pub fn borrow_mut(&mut self) -> &mut R {
197 &mut self.buf_reader
198 }
199
200 //fi fetch_input
201 /// Fetch input from the underlying stream into the internal buffer,
202 /// moving valid data to the start of the buffer first if
203 /// required. This method should only be invoked if more data is
204 /// required; it is relatively code-heavy.
205 fn fetch_input(&mut self) -> Result<usize> {
206 if self.start>BUFFER_SIZE-BUFFER_SLACK {
207 // Move everything down by self.start
208 let n = self.end - self.start;
209 if n>0 {
210 for i in 0..n {
211 self.current[i] = self.current[self.start+i];
212 }
213 }
214 self.valid_end -= self.start;
215 self.start = 0; // == self.start - self.start
216 self.end = n; // == self.end - self.start
217 }
218 let n = self.buf_reader.read( &mut self.current[self.end..BUFFER_SIZE] )?;
219 self.end += n;
220 if n==0 && self.eof_on_no_data {
221 self.eof = true;
222 }
223 Ok(n)
224 }
225
226 //mp next_char
227 /// Return the next character from the stream, if one is available, or [EOF](Char::Eof).
228 ///
229 /// If there is no data - or not enough data - from the underlying stream, and the [Reader] is operating with the underlying stream *not* indicating EOF with a zero-byte read result, then [NoData](Char::NoData) is returned.
230 ///
231 /// # Errors
232 ///
233 /// May return [Error::MalformedUtf8] if the next bytes in the stream do not make a well-formed UTF8 character.
234 ///
235 /// May return [Error::IoError] if the underlying stream has an IO Error.
236 pub fn next_char(&mut self) -> Result<Char> {
237 if self.eof {
238 Ok(Char::Eof)
239 } else if self.start == self.end { // no data present, try reading data
240 if self.fetch_input()? == 0 {
241 Ok(Char::NoData)
242 } else {
243 self.next_char()
244 }
245 } else if self.start < self.valid_end { // there is valid UTF-8 data at buffer+self.start
246 let s = {
247 // std::str::from_utf8(&self.current[self.start..self.valid_end]).unwrap()
248 unsafe {
249 std::str::from_utf8_unchecked(&self.current[self.start..self.valid_end])
250 }
251 };
252 let ch = s.chars().next().unwrap();
253 let n = ch.len_utf8();
254 self.start += n;
255 self.stream_pos.move_by(n, ch);
256 Ok(Char::Char(ch))
257 } else { // there is data but it may or may not be valid
258 match std::str::from_utf8(&self.current[self.start..self.end]) {
259 Ok(_) => { // the data is valid, mark it and the return from there
260 self.valid_end = self.end;
261 self.next_char()
262 }
263 Err(e) => { // the data is not all valid
264 if e.valid_up_to()>0 { // some bytes form valid UTF-8 - mark them and return that data
265 self.valid_end = self.start+e.valid_up_to();
266 self.next_char()
267 } else { // no valid data - check it is just incomplete, or an actual error
268 match e.error_len() {
269 None => { // incomplete UTF-8 fetch more
270 match self.fetch_input()? {
271 0 => { // ... and eof reached when incomplete UTF8 is present
272 if self.eof {
273 Error::malformed_utf8(self.stream_pos, self.end-self.start)
274 } else {
275 Ok(Char::NoData)
276 }
277 }
278 _ => { // ... but got more data so try that!
279 self.next_char()
280 }
281 }
282 }
283 Some(n) => { // Bad UTF-8 with n bytes used
284 let r = Error::malformed_utf8(self.stream_pos, n);
285 self.stream_pos.move_on_bytes(n);
286 self.start += n;
287 r
288 },
289 }
290 }
291 },
292 }
293 }
294 }
295
296 //zz All done
297}
298
299
300//ip Iterator for Reader - iterate over characters
301//
302// allow missing doc code examples for this as it *has* an example but
303// rustdoc does not pick it up.
304#[allow(missing_doc_code_examples)]
305impl <'a, R:std::io::Read> Iterator for &'a mut Reader<R> {
306 // we will be counting with usize
307 type Item = Result<char>;
308
309 //mp next - return next character or None if end of file
310 fn next(&mut self) -> Option<Self::Item> {
311 match self.next_char() {
312 Ok(Char::Char(ch)) => Some(Ok(ch)),
313 Ok(_) => None,
314 Err(x) => Some(Err(x)),
315 }
316 }
317
318 //zz All done
319}