utf8_bufread/
lib.rs

1#![feature(option_result_unwrap_unchecked)]
2//! This crate provides functions to read utf-8 text from any type implementing [`io::BufRead`]
3//! through a trait, [`BufRead`], without waiting for newline delimiters. These functions take
4//! advantage of buffering and either return `&`[`str`] or [`char`]s. Each has an associated
5//! iterator, some have an equivalent to a [`Map`] iterator that avoids allocation and cloning as
6//! well.
7//!
8//! # Quick Start
9//!
10//! The simplest way to read a file using this crate may be something along the following:
11//!
12//! ```
13//! use utf8_bufread::BufRead;
14//! use std::io::{Cursor, ErrorKind};
15//! use std::borrow::Cow;
16//!
17//! // Reader may be any type implementing io::BufRead
18//! // We'll just use a cursor wrapping a slice for this example
19//! let mut reader = Cursor::new("Löwe 老虎 Léopard");
20//! loop { // Loop until EOF
21//!     match reader.read_str() {
22//!         Ok(s) => {
23//!             if s.is_empty() {
24//!                 break; // EOF
25//!             }
26//!             // Do something with `s` ...
27//!             print!("{}", s);
28//!         }
29//!         Err(e) => {
30//!             // We should try again if we get interrupted
31//!             if e.kind() != ErrorKind::Interrupted {
32//!                 break;
33//!             }
34//!         }
35//!     }
36//! }
37//! ```
38//!
39//! # Reading arbitrary-length string slices
40//!
41//! The [`read_str`] function returns a `&`[`str`] of arbitrary length (up to the reader's buffer
42//! capacity) read from the inner reader, without cloning data, unless a valid codepoint ends up
43//! cut at the end of the reader's buffer. Its associated iterator can be obtained by calling
44//! [`str_iter`], and since it involves cloning the data at each iteration, [`str_map`] is also
45//! provided.
46//!
47//! # Reading codepoints
48//!
49//! The [`read_char`] function returns a [`char`] read from the inner reader. Its associated
50//! iterator can be obtained by calling [`char_iter`].
51//!
52//! # Iterator types
53//!
54//! This crate provides several structs for several ways of iterating over the inner reader's data:
55//! - [`StrIter`] and [`CodepointIter`] clone the data on each iteration, but use an [`Rc`] to
56//!   check if the returned [`String`] buffer is still used. If not, it is re-used to avoid
57//!   re-allocating.
58//!   ```
59//!   use utf8_bufread::BufRead;
60//!   use std::io::Cursor;
61//!
62//!   let mut reader = Cursor::new("Löwe 老虎 Léopard");
63//!   for s in reader.str_iter().filter_map(|r| r.ok()) {
64//!       // Do something with s ...
65//!       print!("{}", s);
66//!   }
67//!   ```
68//! - [`StrMap`] and [`CodepointMap`] allow having access to read data without cloning, but then it
69//!   cannot be passed to further iterator adapters.
70//!   ```
71//!   use utf8_bufread::BufRead;
72//!   use std::io::Cursor;
73//!
74//!   let s = "Löwe 老虎 Léopard";
75//!   let mut reader = Cursor::new(s);
76//!   let count: usize = reader.str_map(|s| s.len()).filter_map(Result::ok).sum();
77//!   println!("There is {} valid utf-8 bytes in {}", count, s);
78//!   ```
79//! - [`CharIter`] is similar to [`StrIter`] and others, except it relies on [`char`]s implementing
80//!   [`Copy`] and thus doesn't need a buffer nor the "`Rc` trick".
81//!   ```
82//!   use utf8_bufread::BufRead;
83//!   use std::io::Cursor;
84//!
85//!   let s = "Löwe 老虎 Léopard";
86//!   let mut reader = Cursor::new(s);
87//!   let count = reader.char_iter().filter_map(Result::ok).filter(|c| c.is_lowercase()).count();
88//!   assert_eq!(count, 9);
89//!   ```
90//!
91//! All these iterators may read data until EOF or an invalid codepoint is found. If valid
92//! codepoints are read from the inner reader, they *will* be returned before reporting an error.
93//! After encountering an error or EOF, they always return `None`. They always ignore any
94//! [`Interrupted`] error.
95//!
96//! [`read_str`]: self::BufRead::read_str
97//! [`str_iter`]: self::BufRead::str_iter
98//! [`str_map`]: self::BufRead::str_map
99//! [`read_char`]: self::BufRead::read_char
100//! [`char_iter`]: self::BufRead::char_iter
101//! [`Map`]: std::iter::Map
102//! [`Interrupted`]: std::io::ErrorKind::Interrupted
103
104#[deny(missing_crate_level_docs, missing_docs, missing_doc_code_examples)]
105mod error;
106
107use error::Result;
108use std::borrow::Cow;
109use std::io::{self, ErrorKind};
110use std::rc::Rc;
111use std::slice::from_raw_parts;
112use std::str::{from_utf8, from_utf8_unchecked, FromStr};
113
114pub use error::Error;
115
116/// A trait implemented for all types implementing [`io::BufRead`], providing  functions to
117/// read utf-8 text streams without waiting for newline delimiters.
118///
119/// [`io::BufRead`]: std::io::BufRead
120///
121/// # Examples
122///
123/// ```
124/// use std::io::Cursor;
125/// use utf8_bufread::BufRead;
126///
127/// // Prints "I luv you too !"
128/// if Cursor::new("💖").read_str().map_or(false, |s| s == "💖") {
129///     println!("I luv you too !");
130/// }
131/// ```
132pub trait BufRead: io::BufRead {
133    /// Reads some bytes from the inner reader and returns a [`Cow`]`<&`[`str`]`>` of it referring
134    /// to all valid codepoints read, wrapped in an [`io::Result`].
135    ///
136    /// This function will read all bytes from the underlying stream until its buffer is full, an
137    /// invalid or incomplete codepoint is found, or EOF is found. Once found, all codepoints up
138    /// to, including the EOF (if found), but not including the invalid or incomplete codepoint
    /// (if found), will be returned. This function may read an arbitrary number of bytes, between
    /// 1 and this reader's buffer capacity (unless the buffer is not big enough to fit a unicode
    /// codepoint).
142    ///
143    /// The returned reference points to this reader's actual buffer, meaning it borrows the
144    /// reader.
145    ///
146    /// A [`Cow`] is used to gracefully handle cases where a valid codepoint is cut by the end of
147    /// the buffer of this reader, and more bytes may need to be read from the inner reader to form
148    /// a hopefully valid codepoint. This function only allocates a new [`String`] and clones data
149    /// in that scenario. In worst case it happens once every two calls, allocating and cloning
150    /// 4 bytes every `c` bytes read, where `c` is this reader's buffer capacity.
151    ///
152    /// If this function returns [`Ok`]`("")`, the stream has reached EOF.
153    ///
154    /// # Errors
155    ///
    /// If the internal call to [`fill_buf`] returns an [`io::Error`], this function immediately
    /// returns an [`Error`] wrapping the original error.
158    ///
159    /// If the first codepoint read from the inner reader is invalid, an [`Error`] wrapping the
160    /// original [`Utf8Error`] or [`FromUtf8Error`] is returned.
161    ///
162    /// If the codepoint is complete but invalid, the returned error will have a [`kind`] of
163    /// [`ErrorKind`]`::`[`InvalidData`]. If EOF was encountered before the end of a codepoint,
164    /// the error will have a [`kind`] of [`ErrorKind`]`::`[`UnexpectedEof`].
165    ///
166    /// The returned [`Error`] may contain a non-zero amount of "leftover" bytes (see
167    /// [`Error::leftovers`] for more info). When it is the case, it is guaranteed that the
168    /// following read operation will not return any of those bytes, nor "skip" bytes from this
169    /// reader.
170    ///
171    /// Note that if the buffer of this reader is less than 4 bytes long it may fail to read
172    /// complete codepoints and "spuriously" return the same error as when it unexpectedly
173    /// encounters EOF, since we're unable to load enough bytes to form a valid codepoint. We
174    /// cannot check the capacity of the buffer using the [`io::BufRead`] API only, it is then up
175    /// to the user to ensure this won't happen. *It should not happen unless you explicitly set
176    /// the capacity yourself*.
177    ///
178    /// # Examples
179    ///
180    /// This example simply reads from a stream and prints it to standard output.
181    ///
182    /// ```
183    /// use std::io::{Cursor, Error, ErrorKind};
184    /// use utf8_bufread::BufRead;
185    /// use std::borrow::Cow;
186    ///
187    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
188    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
189    ///
190    /// loop {
191    ///     match reader.read_str() {
192    ///         Ok(s) => {
193    ///             if s.is_empty() {
194    ///                 break; // EOF
195    ///             }
196    ///             print!("{}", s)
197    ///         }
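    ///         Err(e) => {
    ///             // Interrupted reads are transient: simply retry. Other errors may keep
    ///             // recurring on subsequent calls, so report them and stop reading.
    ///             if ErrorKind::Interrupted != e.kind() {
    ///                 eprintln!("{}", e);
    ///                 break;
    ///             }
    ///         }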
204    ///     }
205    /// }
206    /// ```
207    ///
208    /// [`kind`]: self::Error::kind
209    /// [`fill_buf`]: std::io::BufRead::fill_buf
210    /// [`Interrupted`]: std::io::ErrorKind::Interrupted
211    /// [`InvalidData`]: std::io::ErrorKind::InvalidData
212    /// [`UnexpectedEof`]: std::io::ErrorKind::UnexpectedEof
213    /// [`Utf8Error`]: std::str::Utf8Error
214    /// [`FromUtf8Error`]: std::string::FromUtf8Error
215    fn read_str(&mut self) -> Result<Cow<str>> {
216        // Fill the buffer from inner reader's data and get its content
217        let read_bytes = self.fill_buf()?;
218        let read_len = read_bytes.len();
219        if read_len == 0 {
220            return Ok(Cow::from(""));
221        }
222        let ptr = read_bytes.as_ptr();
223        // We attempt converting read bytes to utf8
224        match from_utf8(read_bytes) {
225            Ok(_) => {
226                self.consume(read_len);
227                // The call to `from_raw_parts` is safe, as:
228                // a. It is within the memory region of the reader's now filled buffer.
229                // b. Implicit lifetimes imply the reader is mutably borrowed for the lifetime of the
230                //    returned str reference
231                // TODO: ask for review of point b. above
232                // The call to `from_utf8_unchecked` is safe as we just ran the validation on the same
233                // memory region above
234                Ok(Cow::from(unsafe {
235                    from_utf8_unchecked(from_raw_parts(ptr, read_len))
236                }))
237            }
238            Err(e) => {
239                // If we have an error, we will first attempt to return all valid read bytes,
240                // putting the invalid or incomplete codepoint at the beginning of the buffer.
241                // This allows us to recover from reading up to a byte that isn't on a char
242                // boundary by reading the complete codepoint on the next call
243                let len = e.valid_up_to();
244                if len != 0 {
245                    self.consume(len);
246                    // This is safe, see `Utf8Error::valid_up_to(&self)` doc
247                    Ok(Cow::from(unsafe {
248                        from_utf8_unchecked(from_raw_parts(ptr, len))
249                    }))
250                } else if read_len >= codepoint_length(read_bytes[0]) {
251                    // If we cannot decode any valid utf8 byte from the buffer, it either means
252                    // - We reached EOF with an incomplete codepoint, we should return an
253                    //   UnexpectedEof Error
254                    // - There was a parse error earlier, and we read everything up to this
255                    //   point in a previous read call, there is two possible situations again:
256                    //   - There is more than 2 bytes following the first byte of the invalid
257                    //     slice, this means there truly is an invalid codepoint, we should
258                    //     return an Utf8Error
259                    //   - There is less than 4 bytes left in the buffer, meaning we may have
260                    //     an incomplete codepoint and need to read up to 3 bytes further.
261                    // We know read_bytes is not empty
262                    // We couldn't get a valid codepoint despite reading enough bytes
263                    Err(Error::from(e))
264                } else {
265                    // Not enough bytes read, we will try to read more bytes
266                    // Consume the last bytes, so that the next call to `fill_buff` will read
267                    // more bytes from the underlying stream
268                    self.consume(read_len);
269                    read_across_boundary(self, Vec::from(unsafe { from_raw_parts(ptr, read_len) }))
270                }
271            }
272        }
273    }
274
275    /// Reads 1 to 4 bytes from the inner reader and returns a [`Cow`]`<&`[`str`]`>` of it
276    /// referring to the valid codepoints read, wrapped in an [`io::Result`].
277    ///
278    /// This function will read bytes from the underlying stream until one codepoint is read, an
279    /// invalid or incomplete codepoint is found, or EOF is found.
280    ///
281    /// The returned reference points to this reader's actual buffer, meaning it borrows the
282    /// reader.
283    ///
284    /// A [`Cow`] is used to gracefully handle cases where a valid codepoint is cut by the end of
285    /// the buffer of this reader, and more bytes may need to be read from the inner reader to form
286    /// a hopefully valid codepoint. This function only allocates a new [`String`] and clones data
287    /// in that scenario. In worst case it allocates and clones 4 bytes every `c` bytes read,
288    /// where `c` is this reader's buffer capacity.
289    ///
290    /// If this function returns [`Ok`]`("")`, the stream has reached EOF.
291    ///
292    /// # Errors
293    ///
    /// If the internal call to [`fill_buf`] returns an [`io::Error`], this function immediately
    /// returns an [`Error`] wrapping the original error.
296    ///
297    /// If the first codepoint read from the inner reader is invalid or incomplete, an [`Error`]
298    /// wrapping the original [`Utf8Error`] or [`FromUtf8Error`] is returned.
299    ///
300    /// If the codepoint is complete but invalid, the returned error will have a [`kind`] of
301    /// [`ErrorKind`]`::`[`InvalidData`]. If EOF was encountered before the end of a codepoint,
302    /// the error will have a [`kind`] of [`ErrorKind`]`::`[`UnexpectedEof`].
303    ///
304    /// The returned [`Error`] may contain a non-zero amount of "leftover" bytes (see
305    /// [`Error::leftovers`] for more info). When it is the case, it is guaranteed that the
306    /// following read operation will not return any of those bytes, nor "skip" bytes from this
307    /// reader.
308    ///
309    /// Note that if the buffer of this reader is less than 4 bytes long it may fail to read
310    /// complete codepoints and "spuriously" return the same error as when it unexpectedly
311    /// encounters EOF, since we're unable to load enough bytes to form a valid codepoint. We
312    /// cannot check the capacity of the buffer using the [`io::BufRead`] API only, it is then up
313    /// to the user to ensure this won't happen. *It should not happen unless you explicitly set
314    /// the capacity yourself*.
315    ///
316    /// # Examples
317    ///
318    /// This example simply reads from a stream and counts the number of `🏳` character.
319    ///
320    /// ```
321    /// use std::io::{Cursor, Error, ErrorKind};
322    /// use utf8_bufread::BufRead;
323    /// use std::borrow::Cow;
324    ///
325    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
326    /// let mut  reader = Cursor::new("Löwe 老虎 🏳Léopard");
327    /// let mut count = 0;
328    ///
329    /// loop {
330    ///     match reader.read_codepoint() {
331    ///         Ok(s) => {
332    ///             if s.is_empty() {
333    ///                 break; // EOF
334    ///             }
335    ///             if s == "🏳" {
336    ///                 count += 1;
337    ///             }
338    ///         }
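    ///         Err(e) => {
    ///             // Interrupted reads are transient: simply retry. Other errors may keep
    ///             // recurring on subsequent calls, so report them and stop reading.
    ///             if ErrorKind::Interrupted != e.kind() {
    ///                 eprintln!("{}", e);
    ///                 break;
    ///             }
    ///         }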
345    ///     }
346    /// }
347    /// assert_eq!(count, 1);
348    /// ```
349    #[doc(hidden)]
350    fn read_codepoint(&mut self) -> Result<Cow<str>> {
351        // Fill the buffer from inner reader's data and get its content
352        let read_bytes = self.fill_buf()?;
353        let read_len = read_bytes.len();
354        if read_len == 0 {
355            return Ok(Cow::from(""));
356        }
357        let ptr = read_bytes.as_ptr();
358        let len = codepoint_length(read_bytes[0]);
359        if read_len < len {
360            // Not enough bytes read, we will try to read more bytes
361            // Consume the last bytes, so that the next call to `fill_buff` will read
362            // more bytes from the underlying stream
363            self.consume(read_len);
364            read_across_boundary(self, Vec::from(unsafe { from_raw_parts(ptr, read_len) }))
365        } else {
366            match from_utf8(&read_bytes[..len]) {
367                Ok(_) => {
368                    self.consume(len);
369                    // The call to `from_raw_parts` is safe, as:
370                    // a. It is within the memory region of the reader's now filled buffer.
371                    // b. Implicit lifetimes imply the reader is mutably borrowed for the lifetime of the
372                    //    returned str reference
373                    // TODO: ask for review of point b. above
374                    // The call to `from_utf8_unchecked` is safe as we just ran the validation on the same
375                    // memory region above
376                    Ok(Cow::from(unsafe {
377                        from_utf8_unchecked(from_raw_parts(ptr, len))
378                    }))
379                }
380                Err(e) => Err(Error::from(e)),
381            }
382        }
383    }
384
385    /// Reads 1 to 4 bytes from the inner reader and returns the [`char`] read, wrapped in an
386    /// [`io::Result`].
387    ///
388    /// This function will read bytes from the underlying stream until one codepoint is read, an
389    /// invalid or incomplete codepoint is found, or EOF is found.
390    ///
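    /// If this function returns [`Ok`]`('\0')`, the stream has reached EOF. Note that a NUL
    /// character actually present in the stream is returned as `'\0'` too, so the return value
    /// alone cannot tell the two cases apart.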
392    ///
393    /// # Errors
394    ///
    /// If the internal call to [`fill_buf`] returns an [`io::Error`], this function immediately
    /// returns an [`Error`] wrapping the original error.
397    ///
398    /// If the first codepoint read from the inner reader is invalid or incomplete, an [`Error`]
399    /// wrapping the original [`Utf8Error`] or [`FromUtf8Error`] is returned.
400    ///
401    /// If the codepoint is complete but invalid, the returned error will have a [`kind`] of
402    /// [`ErrorKind`]`::`[`InvalidData`]. If EOF was encountered before the end of a codepoint,
403    /// the error will have a [`kind`] of [`ErrorKind`]`::`[`UnexpectedEof`].
404    ///
405    /// The returned [`Error`] may contain a non-zero amount of "leftover" bytes (see
406    /// [`Error::leftovers`] for more info). When it is the case, it is guaranteed that the
407    /// following read operation will not return any of those bytes, nor "skip" bytes from this
408    /// reader.
409    ///
410    /// Note that if the buffer of this reader is less than 4 bytes long it may fail to read
411    /// complete codepoints and "spuriously" return the same error as when it unexpectedly
412    /// encounters EOF, since we're unable to load enough bytes to form a valid codepoint. We
413    /// cannot check the capacity of the buffer using the [`io::BufRead`] API only, it is then up
414    /// to the user to ensure this won't happen. *It should not happen unless you explicitly set
415    /// the capacity yourself*.
416    ///
417    /// # Examples
418    ///
419    /// This example simply reads from a stream and counts the number of lowercase characters
420    ///
421    /// ```
422    /// use std::io::{Cursor, Error, ErrorKind};
423    /// use utf8_bufread::BufRead;
424    /// use std::borrow::Cow;
425    ///
426    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
427    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
428    /// let mut count = 0;
429    ///
430    /// loop {
431    ///     match reader.read_char() {
432    ///         Ok('\0') => break, // EOF
433    ///         Ok(c) => {
434    ///             if c.is_lowercase() {
435    ///                 count += 1;
436    ///             }
437    ///         }
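    ///         Err(e) => {
    ///             // Interrupted reads are transient: simply retry. Other errors may keep
    ///             // recurring on subsequent calls, so report them and stop reading.
    ///             if ErrorKind::Interrupted != e.kind() {
    ///                 eprintln!("{}", e);
    ///                 break;
    ///             }
    ///         }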
444    ///     }
445    /// }
446    /// assert_eq!(count, 9);
447    /// ```
448    ///
449    /// [`fill_buf`]: std::io::BufRead::fill_buf
450    /// [`Interrupted`]: std::io::ErrorKind::Interrupted
451    /// [`InvalidData`]: std::io::ErrorKind::InvalidData
452    /// [`UnexpectedEof`]: std::io::ErrorKind::UnexpectedEof
453    /// [`Utf8Error`]: std::str::Utf8Error
454    /// [`FromUtf8Error`]: std::string::FromUtf8Error
455    /// [`kind`]: crate::error::Error::kind
456    fn read_char(&mut self) -> Result<char> {
457        // We guarantee that self.read_codepoint returns:
458        // - An empty string or
459        // - Exactly one valid codepoint
460        let c = self.read_codepoint()?;
461        if c.is_empty() {
462            return Ok('\0');
463        }
464        Ok(unsafe { char::from_str(c.as_ref()).unwrap_unchecked() })
465    }
466
467    /// Returns an iterator over string slices of this reader.
468    ///
469    /// It is equivalent to calling [`read_str`] in a loop, ignoring
470    /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
471    ///
472    /// The iterator returned by this function will yield instances of
473    /// [`io::Result`]`<`[`Rc`]`<`[`String`]`>>`. We use the [`Rc`] to check while iterating if the
474    /// iterator is the only one holding a reference to it, avoiding allocating a new buffer if
475    /// that's the case.
476    ///
477    /// The iterator returned will yield at most one [`io::Error`]. Once an error is yielded, it
478    /// will only yield [`None`].
479    ///
480    /// # Examples
481    ///
482    /// This example simply reads from a string and prints it to standard output:
483    ///
484    /// ```
485    /// use std::io::Cursor;
486    /// use utf8_bufread::BufRead;
487    ///
488    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
489    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
490    /// // We ignore any error, we know once we encounter one we can't read any further anyway
491    /// reader.str_iter().filter_map(Result::ok).for_each(|s| print!("{}", s));
492    /// ```
493    ///
494    /// [`read_str`]: self::BufRead::read_str
495    /// [`Interrupted`]: std::io::ErrorKind::Interrupted
496    fn str_iter(&mut self) -> StrIter<'_, Self> {
497        let default_cap = 8 * 1024;
498        StrIter {
499            reader: self,
500            buf: Rc::new(String::with_capacity(default_cap)),
501            default_cap,
502            ended: false,
503        }
504    }
505
506    /// Returns an iterator over codepoints of this reader.
507    ///
508    /// It is equivalent to calling [`read_codepoint`] in a loop, ignoring
509    /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
510    ///
511    /// The iterator returned by this function will yield instances of
512    /// [`io::Result`]`<`[`Rc`]`<`[`String`]`>>`. We use the [`Rc`] to check while iterating if the
513    /// iterator is the only one holding a reference to it, avoiding allocating a new buffer if
514    /// that's the case.
515    ///
516    /// The iterator returned will yield at most one [`io::Error`]. Once an error is yielded, it
517    /// will only yield [`None`].
518    ///
519    /// # Examples
520    ///
521    /// This example simply reads from a stream and counts the number of `🏳` character.
522    ///
523    /// ```
524    /// use std::io::Cursor;
525    /// use utf8_bufread::BufRead;
526    ///
527    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
528    /// let mut  reader = Cursor::new("Löwe 老虎 🏳Léopard");
529    /// let count = reader.codepoint_iter()
530    ///     .filter_map(Result::ok)
531    ///     .filter(|s| s.as_ref() == "🏳")
532    ///     .count();
533    /// assert_eq!(count, 1);
534    /// ```
535    #[doc(hidden)]
536    fn codepoint_iter(&mut self) -> CodepointIter<'_, Self> {
537        let default_cap = 4;
538        CodepointIter {
539            reader: self,
540            buf: Rc::new(String::with_capacity(default_cap)),
541            default_cap,
542            ended: false,
543        }
544    }
545
546    /// Returns an iterator over chars of this reader.
547    ///
548    /// It is equivalent to calling [`read_char`] in a loop, ignoring
549    /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
550    ///
551    /// The iterator returned by this function will yield instances of
552    /// [`io::Result`]`<`[`char`]`>`.
553    ///
554    /// The iterator returned will yield at most one [`io::Error`]. Once an error is yielded, it
555    /// will only yield [`None`].
556    ///
557    /// # Examples
558    ///
559    /// This example simply reads from a stream, filtering out any whitespace:
560    ///
561    /// ```
562    /// use std::io::Cursor;
563    /// use utf8_bufread::BufRead;
564    ///
565    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
566    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
567    /// let result: String = reader.char_iter()
568    ///     .filter_map(Result::ok)
569    ///     .filter(|c| !c.is_whitespace())
570    ///     .collect();
571    /// assert_eq!(result.as_str(), "Löwe老虎Léopard");
572    /// ```
573    ///
574    /// [`read_char`]: self::BufRead::read_char
575    /// [`Interrupted`]: std::io::ErrorKind::Interrupted
576    fn char_iter(&mut self) -> CharIter<'_, Self> {
577        CharIter {
578            reader: self,
579            ended: false,
580        }
581    }
582
    /// Returns a mapping iterator over string slices of this reader.
584    ///
585    /// It is equivalent to calling [`read_str`] in a loop, ignoring
586    /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
587    ///
588    /// The iterator returned by this function will call `f` with instances of [`Cow`]`<`[`str`]`>`
    /// as returned by [`read_str`], and yield instances of [`io::Result`]`<T>`. This may help
    /// avoid the allocations and cloning [`str_iter`] does.
591    ///
592    /// The iterator returned will yield at most one [`io::Error`], and if one is yielded it will
593    /// always be the last item.
594    ///
595    /// # Examples
596    ///
597    /// This example simply reads from a stream and counts the number of bytes read:
598    ///
599    /// ```
600    /// use std::io::Cursor;
601    /// use utf8_bufread::BufRead;
602    ///
603    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
604    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
605    /// let count: usize = reader.str_map(|s| s.len()).filter_map(Result::ok).sum();
606    /// assert_eq!(count, 21);
607    /// ```
608    ///
609    /// [`read_str`]: self::BufRead::read_str
610    /// [`str_iter`]: self::BufRead::str_iter
611    /// [`Interrupted`]: std::io::ErrorKind::Interrupted
612    fn str_map<F, T>(&mut self, f: F) -> StrMap<'_, Self, F>
613    where
614        F: FnMut(Cow<str>) -> T,
615    {
616        StrMap {
617            reader: self,
618            map: Rc::new(f),
619            ended: false,
620        }
621    }
622
    /// Returns a mapping iterator over codepoints of this reader.
624    ///
625    /// It is equivalent to calling [`read_codepoint`] in a loop, ignoring
626    /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
627    ///
628    /// The iterator returned by this function will call `f` with instances of [`Cow`]`<`[`str`]`>`
    /// as returned by [`read_str`], and yield instances of [`io::Result`]`<T>`. This may help
    /// avoid the allocations and cloning [`str_iter`] does.
631    ///
632    /// The iterator returned will yield at most one [`io::Error`], and if one is yielded it will
633    /// always be the last item.
634    ///
635    /// # Examples
636    ///
    /// This example simply maps each codepoint to its length in bytes:
638    ///
639    /// ```
640    /// use std::io::Cursor;
641    /// use utf8_bufread::BufRead;
642    ///
643    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
644    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
645    /// let lengths: Vec<_> = reader.codepoint_map(|s| s.len()).filter_map(Result::ok).collect();
646    /// assert_eq!(lengths.as_ref(), [1, 2, 1, 1, 1, 3, 3, 1, 1, 2, 1, 1, 1, 1, 1]);
647    /// ```
648    #[doc(hidden)]
649    fn codepoint_map<F, T>(&mut self, f: F) -> CodepointMap<'_, Self, F>
650    where
651        F: FnMut(Cow<str>) -> T,
652    {
653        CodepointMap {
654            reader: self,
655            map: Rc::new(f),
656            ended: false,
657        }
658    }
659}
660
661impl<R: io::BufRead> BufRead for R {}
662
663/// An iterator over string slices of an instance of [`io::BufRead`], created by [`str_iter`], see
664/// its documentation for more details.
665///
666/// [`str_iter`]: self::BufRead::str_iter
pub struct StrIter<'r, R: ?Sized> {
    /// Reader the string slices are read from.
    reader: &'r mut R,
    /// Buffer handed out on each iteration, shared with the caller through the `Rc`.
    buf: Rc<String>,
    /// Capacity given to a newly allocated buffer.
    default_cap: usize,
    /// Whether the iteration is over, after which only `None` is yielded.
    ended: bool,
}
676
677impl<R> Iterator for StrIter<'_, R>
678where
679    R: io::BufRead,
680{
681    type Item = Result<Rc<String>>;
682
683    //noinspection DuplicatedCode
684    fn next(&mut self) -> Option<Self::Item> {
685        if self.ended {
686            return None;
687        }
688        let buf = match Rc::get_mut(&mut self.buf) {
689            None => {
690                self.buf = Rc::new(String::with_capacity(self.default_cap));
691                Rc::make_mut(&mut self.buf)
692            }
693            Some(buf) => {
694                buf.clear();
695                buf
696            }
697        };
698        loop {
699            match self.reader.read_str() {
700                Err(e) => {
701                    if let ErrorKind::Interrupted = e.kind() {
702                        continue;
703                    }
704                    self.ended = true;
705                    break Some(Err(e));
706                }
707                Ok(s) => {
708                    if s.is_empty() {
709                        self.ended = true;
710                        break None;
711                    } else {
712                        buf.push_str(s.as_ref());
713                        break Some(Ok(Rc::clone(&self.buf)));
714                    }
715                }
716            }
717        }
718    }
719}
720
/// An iterator over codepoints of an instance of [`io::BufRead`], created by
/// [`codepoint_iter`], see its documentation for more details.
///
/// [`codepoint_iter`]: self::BufRead::codepoint_iter
725#[doc(hidden)]
726pub struct CodepointIter<'r, R>
727where
728    R: ?Sized,
729{
730    reader: &'r mut R,
731    buf: Rc<String>,
732    default_cap: usize,
733    ended: bool,
734}
735
736impl<R> Iterator for CodepointIter<'_, R>
737where
738    R: io::BufRead,
739{
740    type Item = Result<Rc<String>>;
741
742    //noinspection DuplicatedCode
743    fn next(&mut self) -> Option<Self::Item> {
744        if self.ended {
745            return None;
746        }
747        let buf = match Rc::get_mut(&mut self.buf) {
748            None => {
749                self.buf = Rc::new(String::with_capacity(self.default_cap));
750                Rc::make_mut(&mut self.buf)
751            }
752            Some(buf) => {
753                buf.clear();
754                buf
755            }
756        };
757        loop {
758            match self.reader.read_codepoint() {
759                Err(e) => {
760                    if let ErrorKind::Interrupted = e.kind() {
761                        continue;
762                    }
763                    self.ended = true;
764                    break Some(Err(e));
765                }
766                Ok(s) => {
767                    if s.is_empty() {
768                        self.ended = true;
769                        break None;
770                    } else {
771                        buf.push_str(s.as_ref());
772                        break Some(Ok(Rc::clone(&self.buf)));
773                    }
774                }
775            }
776        }
777    }
778}
779
/// Mapping iterator over the string slices read from an instance of [`io::BufRead`].
///
/// Instances are created by [`str_map`]; refer to that method's documentation for more details.
///
/// [`str_map`]: self::BufRead::str_map
pub struct StrMap<'r, R: ?Sized, F> {
    // Reader the text is pulled from.
    reader: &'r mut R,
    // Mapping function applied to every string slice read.
    map: Rc<F>,
    // Set once the end of the input or a non-recoverable error has been met.
    ended: bool,
}
792
793impl<R, F, T> Iterator for StrMap<'_, R, F>
794where
795    R: io::BufRead,
796    F: FnMut(Cow<str>) -> T,
797{
798    type Item = Result<T>;
799
800    //noinspection DuplicatedCode
801    fn next(&mut self) -> Option<Self::Item> {
802        if self.ended {
803            return None;
804        }
805        loop {
806            match self.reader.read_str() {
807                Ok(s) => {
808                    if s.is_empty() {
809                        self.ended = true;
810                        break None;
811                    } else {
812                        break Some(Ok((Rc::get_mut(&mut self.map)
813                            .expect("MappingIter's mapping function cannot be shared !"))(
814                            s
815                        )));
816                    }
817                }
818                Err(e) => {
819                    if let ErrorKind::Interrupted = e.kind() {
820                        continue;
821                    }
822                    self.ended = true;
823                    break Some(Err(e));
824                }
825            }
826        }
827    }
828}
829
/// Mapping iterator over the codepoints read from an instance of [`io::BufRead`].
///
/// This is the codepoint-wise counterpart of [`StrMap`]; refer to the documentation of the
/// method creating it for more details.
#[doc(hidden)]
pub struct CodepointMap<'r, R: ?Sized, F> {
    // Reader the codepoints are pulled from.
    reader: &'r mut R,
    // Mapping function applied to every codepoint read.
    map: Rc<F>,
    // Set once the end of the input or a non-recoverable error has been met.
    ended: bool,
}
841
842impl<R, F, T> Iterator for CodepointMap<'_, R, F>
843where
844    R: io::BufRead,
845    F: FnMut(Cow<str>) -> T,
846{
847    type Item = Result<T>;
848
849    //noinspection DuplicatedCode
850    fn next(&mut self) -> Option<Self::Item> {
851        if self.ended {
852            return None;
853        }
854        loop {
855            match self.reader.read_codepoint() {
856                Ok(s) => {
857                    if s.is_empty() {
858                        self.ended = true;
859                        break None;
860                    } else {
861                        break Some(Ok((Rc::get_mut(&mut self.map)
862                            .expect("MappingIter's mapping function cannot be shared !"))(
863                            s
864                        )));
865                    }
866                }
867                Err(e) => {
868                    if let ErrorKind::Interrupted = e.kind() {
869                        continue;
870                    }
871                    self.ended = true;
872                    break Some(Err(e));
873                }
874            }
875        }
876    }
877}
878
/// Iterator yielding the [`char`]s read from an instance of [`io::BufRead`].
///
/// Instances are created by [`char_iter`]; refer to that method's documentation for more
/// details.
///
/// [`char_iter`]: self::BufRead::char_iter
pub struct CharIter<'r, R: ?Sized> {
    // Reader the characters are pulled from.
    reader: &'r mut R,
    // Set once the iteration is over, so that later calls yield nothing.
    ended: bool,
}
890
891impl<R> Iterator for CharIter<'_, R>
892where
893    R: io::BufRead,
894{
895    type Item = Result<char>;
896
897    fn next(&mut self) -> Option<Self::Item> {
898        if self.ended {
899            return None;
900        }
901        match self.reader.read_char() {
902            Ok(c) => {
903                if c == '\0' {
904                    self.ended = true;
905                    None
906                } else {
907                    Some(Ok(c))
908                }
909            }
910            Err(e) => {
911                self.ended = true;
912                Some(Err(e))
913            }
914        }
915    }
916}
917
918fn read_across_boundary<R>(reader: &mut R, mut leftovers: Vec<u8>) -> Result<Cow<str>>
919where
920    R: io::BufRead + ?Sized,
921{
922    debug_assert!(!leftovers.is_empty());
923    // We know leftovers is not empty
924    let len = codepoint_length(leftovers[0]);
925    let first_read_len = leftovers.len();
926    debug_assert!(len > first_read_len);
927    let additional_len = (len - first_read_len) as usize;
928    // Let's try reading more bytes
929    let additional_bytes = &reader.fill_buf()?;
930    if additional_bytes.len() < additional_len {
931        // Not enough additional bytes, we reached EOF on an incomplete codepoint
932        return Err(Error::from(ErrorKind::UnexpectedEof).with_leftovers(leftovers));
933    }
934    // we know we have enough data
935    leftovers.extend_from_slice(&additional_bytes[..additional_len]);
936    reader.consume(additional_len);
937    match String::from_utf8(leftovers) {
938        Ok(s) => Ok(Cow::from(s)),
939        // We read enough bytes, they simply were not valid
940        Err(e) => Err(Error::from(e)),
941    }
942}
943
/// Returns the number of bytes expected for a codepoint whose first byte is `x`.
///
/// No validation is performed: bytes in `0x80..=0xDF` all map to 2 and bytes in `0xF0..=0xFF`
/// all map to 4, whether or not they can actually start a codepoint.
#[inline]
fn codepoint_length(x: u8) -> usize {
    match x {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}
956
#[cfg(test)]
mod read_str_tests {
    use crate::BufRead;
    use std::io::{BufReader, Cursor, ErrorKind};
    use std::str::Utf8Error;
    use std::string::FromUtf8Error;

    /// An empty input is read successfully as an empty string.
    #[test]
    fn empty_read() {
        let mut reader = Cursor::new("");
        let read = reader
            .read_str()
            .expect("reading an empty input should succeed");
        assert!(read.is_empty());
    }

    /// Invalid bytes sitting in the buffer are reported as invalid data wrapping a `Utf8Error`.
    #[test]
    fn invalid_in_buffer() {
        let mut reader = Cursor::new([0x9fu8, 0x92, 0x96, 0x0]);
        let err = reader
            .read_str()
            .expect_err("reading invalid bytes should fail");
        assert_eq!(err.kind(), ErrorKind::InvalidData);
        let inner = err
            .into_inner_checked()
            .expect("the checked conversion should succeed")
            .expect("the error should wrap an inner error");
        assert!(inner.is::<Utf8Error>());
    }

    /// A truncated codepoint sitting in the buffer is reported as an unexpected EOF.
    #[test]
    fn incomplete_in_buffer() {
        let mut reader = Cursor::new(&"💖".as_bytes()[..3]);
        let err = reader
            .read_str()
            .expect_err("reading a truncated codepoint should fail");
        assert_eq!(err.kind(), ErrorKind::UnexpectedEof);
        assert!(!err.leftovers().is_empty());
        assert!(err.into_inner_lossy().is_none());
    }

    /// Invalid bytes split by the buffer's end are reported as invalid data wrapping a
    /// `FromUtf8Error`.
    #[test]
    fn invalid_across_boundary() {
        let bytes = [0xffu8, 0x92, 0x96, 0x0];
        let mut reader = BufReader::<&[u8]>::with_capacity(2, &bytes[..]);
        let err = reader
            .read_str()
            .expect_err("reading invalid bytes should fail");
        assert_eq!(err.kind(), ErrorKind::InvalidData);
        assert!(!err.leftovers().is_empty());
        let inner = err
            .into_inner_lossy()
            .expect("the error should wrap an inner error");
        assert!(inner.is::<FromUtf8Error>());
    }

    /// A truncated codepoint split by the buffer's end is reported as an unexpected EOF.
    #[test]
    fn incomplete_across_boundary() {
        let mut reader = BufReader::<&[u8]>::with_capacity(2, &"💖".as_bytes()[..3]);
        let err = reader
            .read_str()
            .expect_err("reading a truncated codepoint should fail");
        assert_eq!(err.kind(), ErrorKind::UnexpectedEof);

        assert!(err.into_inner_lossy().is_none());
    }

    /// A single complete codepoint is read back as is.
    #[test]
    fn complete_successful_read() {
        let mut reader = Cursor::new("💖");
        let read = reader
            .read_str()
            .expect("reading a valid codepoint should succeed");
        assert_eq!(read, "💖");
    }

    /// Valid text followed by invalid bytes is read successfully up to the invalid bytes.
    #[test]
    fn incomplete_successful_read() {
        let mut reader = Cursor::new([0x6fu8, 0xa, 0x9f, 0x92, 0x96, 0x0]);
        let read = reader
            .read_str()
            .expect("reading the valid prefix should succeed");
        assert_eq!(read, "o\n");
    }

    /// A codepoint split by the buffer's end is read back in one piece.
    #[test]
    fn read_across_boundary() {
        let mut reader = BufReader::<&[u8]>::with_capacity(2, "💖".as_bytes());
        let read = reader
            .read_str()
            .expect("reading a split codepoint should succeed");
        assert_eq!(read, "💖");
    }

    /// Text made of codepoints of various lengths is read at once, the next read being empty.
    #[test]
    fn multi_codepoints_read() {
        let mut reader = Cursor::new("foo💖bär€");
        let first = reader
            .read_str()
            .expect("reading valid text should succeed");
        assert_eq!(first, "foo💖bär€");
        let second = reader
            .read_str()
            .expect("reading past the end should succeed");
        assert_eq!(second, "");
    }
}
1066
#[cfg(test)]
mod buf_too_small_tests {
    /// Generates a test named `$name` reading `$input` through a `BufReader` of capacity `$cap`.
    ///
    /// The `success` form expects every read up to EOF to succeed and at least one of them to
    /// be non-empty. The `failure` form expects non-empty reads until a read fails with an
    /// unexpected EOF carrying leftovers and no inner error.
    ///
    /// `BufReader`, `BufRead` and (for `failure`) `ErrorKind` have to be in scope where the
    /// macro is invoked.
    macro_rules! buf_too_small_test {
        ($name:ident $cap:literal $input:literal: success) => {
            #[test]
            fn $name() {
                let mut reader = BufReader::<&[u8]>::with_capacity($cap, $input.as_bytes());
                let mut non_empty_reads = 0;
                // Drain the reader, every single read has to succeed
                loop {
                    let chunk = reader.read_str().expect("every read should succeed");
                    if chunk.is_empty() {
                        break;
                    }
                    non_empty_reads += 1;
                }
                // EOF must not have been met on the very first read
                assert_ne!(non_empty_reads, 0);
            }
        };
        ($name:ident $cap:literal $input:literal: failure) => {
            #[test]
            fn $name() {
                let mut reader = BufReader::<&[u8]>::with_capacity($cap, $input.as_bytes());
                // Keep reading until a read fails
                loop {
                    match reader.read_str() {
                        // EOF shouldn't be reached before a read has failed
                        Ok(chunk) => assert!(!chunk.is_empty()),
                        Err(err) => {
                            assert_eq!(err.kind(), ErrorKind::UnexpectedEof);
                            assert!(!err.leftovers().is_empty());
                            assert!(err.into_inner_lossy().is_none());
                            break;
                        }
                    }
                }
            }
        };
    }

    /// Reads through a buffer holding a single byte.
    mod buf_capacity_1 {
        use crate::BufRead;
        use std::io::{BufReader, ErrorKind};

        buf_too_small_test!(codepoint_length_1_offset_0 1 "f": success);
        buf_too_small_test!(codepoint_length_2_offset_0 1 "ä": success);
        buf_too_small_test!(codepoint_length_3_offset_0 1 "€": failure);
        buf_too_small_test!(codepoint_length_4_offset_0 1 "💖": failure);
    }

    /// Reads through a buffer holding two bytes.
    mod buf_capacity_2 {
        use crate::BufRead;
        use std::io::{BufReader, ErrorKind};

        buf_too_small_test!(codepoint_length_1_offset_0 2 "f": success);
        buf_too_small_test!(codepoint_length_2_offset_0 2 "ä": success);
        buf_too_small_test!(codepoint_length_2_offset_1 2 "xä": success);
        buf_too_small_test!(codepoint_length_3_offset_0 2 "€": success);
        buf_too_small_test!(codepoint_length_3_offset_1 2 "x€": success);
        buf_too_small_test!(codepoint_length_4_offset_0 2 "💖": success);
        buf_too_small_test!(codepoint_length_4_offset_1 2 "x💖": failure);
    }

    /// Reads through a buffer holding three bytes.
    mod buf_capacity_3 {
        use crate::BufRead;
        use std::io::BufReader;

        buf_too_small_test!(codepoint_length_1_offset_0 3 "f": success);
        buf_too_small_test!(codepoint_length_2_offset_0 3 "ä": success);
        buf_too_small_test!(codepoint_length_2_offset_1 3 "xä": success);
        buf_too_small_test!(codepoint_length_3_offset_0 3 "€": success);
        buf_too_small_test!(codepoint_length_3_offset_1 3 "x€": success);
        buf_too_small_test!(codepoint_length_3_offset_2 3 "xx€": success);
        buf_too_small_test!(codepoint_length_4_offset_0 3 "💖": success);
        buf_too_small_test!(codepoint_length_4_offset_1 3 "x💖": success);
        buf_too_small_test!(codepoint_length_4_offset_2 3 "xx💖": success);
    }
}
utf8_bufread/lib.rs

utf8_bufread/
lib.rs