utf8_bufread/lib.rs
1#![feature(option_result_unwrap_unchecked)]
2//! This crate provides functions to read utf-8 text from any type implementing [`io::BufRead`]
3//! through a trait, [`BufRead`], without waiting for newline delimiters. These functions take
4//! advantage of buffering and either return `&`[`str`] or [`char`]s. Each has an associated
5//! iterator, some have an equivalent to a [`Map`] iterator that avoids allocation and cloning as
6//! well.
7//!
8//! # Quick Start
9//!
10//! The simplest way to read a file using this crate may be something along the following:
11//!
12//! ```
13//! use utf8_bufread::BufRead;
14//! use std::io::{Cursor, ErrorKind};
15//! use std::borrow::Cow;
16//!
17//! // Reader may be any type implementing io::BufRead
18//! // We'll just use a cursor wrapping a slice for this example
19//! let mut reader = Cursor::new("Löwe 老虎 Léopard");
20//! loop { // Loop until EOF
21//! match reader.read_str() {
22//! Ok(s) => {
23//! if s.is_empty() {
24//! break; // EOF
25//! }
26//! // Do something with `s` ...
27//! print!("{}", s);
28//! }
29//! Err(e) => {
30//! // We should try again if we get interrupted
31//! if e.kind() != ErrorKind::Interrupted {
32//! break;
33//! }
34//! }
35//! }
36//! }
37//! ```
38//!
39//! # Reading arbitrary-length string slices
40//!
41//! The [`read_str`] function returns a `&`[`str`] of arbitrary length (up to the reader's buffer
42//! capacity) read from the inner reader, without cloning data, unless a valid codepoint ends up
43//! cut at the end of the reader's buffer. Its associated iterator can be obtained by calling
44//! [`str_iter`], and since it involves cloning the data at each iteration, [`str_map`] is also
45//! provided.
46//!
47//! # Reading codepoints
48//!
49//! The [`read_char`] function returns a [`char`] read from the inner reader. Its associated
50//! iterator can be obtained by calling [`char_iter`].
51//!
52//! # Iterator types
53//!
54//! This crate provides several structs for several ways of iterating over the inner reader's data:
55//! - [`StrIter`] and [`CodepointIter`] clone the data on each iteration, but use an [`Rc`] to
56//! check if the returned [`String`] buffer is still used. If not, it is re-used to avoid
57//! re-allocating.
58//! ```
59//! use utf8_bufread::BufRead;
60//! use std::io::Cursor;
61//!
62//! let mut reader = Cursor::new("Löwe 老虎 Léopard");
63//! for s in reader.str_iter().filter_map(|r| r.ok()) {
64//! // Do something with s ...
65//! print!("{}", s);
66//! }
67//! ```
68//! - [`StrMap`] and [`CodepointMap`] allow having access to read data without cloning, but then it
69//! cannot be passed to further iterator adapters.
70//! ```
71//! use utf8_bufread::BufRead;
72//! use std::io::Cursor;
73//!
74//! let s = "Löwe 老虎 Léopard";
75//! let mut reader = Cursor::new(s);
76//! let count: usize = reader.str_map(|s| s.len()).filter_map(Result::ok).sum();
77//! println!("There is {} valid utf-8 bytes in {}", count, s);
78//! ```
79//! - [`CharIter`] is similar to [`StrIter`] and others, except it relies on [`char`]s implementing
80//! [`Copy`] and thus doesn't need a buffer nor the "`Rc` trick".
81//! ```
82//! use utf8_bufread::BufRead;
83//! use std::io::Cursor;
84//!
85//! let s = "Löwe 老虎 Léopard";
86//! let mut reader = Cursor::new(s);
87//! let count = reader.char_iter().filter_map(Result::ok).filter(|c| c.is_lowercase()).count();
88//! assert_eq!(count, 9);
89//! ```
90//!
91//! All these iterators may read data until EOF or an invalid codepoint is found. If valid
92//! codepoints are read from the inner reader, they *will* be returned before reporting an error.
93//! After encountering an error or EOF, they always return `None`. They always ignore any
94//! [`Interrupted`] error.
95//!
96//! [`read_str`]: self::BufRead::read_str
97//! [`str_iter`]: self::BufRead::str_iter
98//! [`str_map`]: self::BufRead::str_map
99//! [`read_char`]: self::BufRead::read_char
100//! [`char_iter`]: self::BufRead::char_iter
101//! [`Map`]: std::iter::Map
102//! [`Interrupted`]: std::io::ErrorKind::Interrupted
103
104#[deny(missing_crate_level_docs, missing_docs, missing_doc_code_examples)]
105mod error;
106
107use error::Result;
108use std::borrow::Cow;
109use std::io::{self, ErrorKind};
110use std::rc::Rc;
111use std::slice::from_raw_parts;
112use std::str::{from_utf8, from_utf8_unchecked, FromStr};
113
114pub use error::Error;
115
116/// A trait implemented for all types implementing [`io::BufRead`], providing functions to
117/// read utf-8 text streams without waiting for newline delimiters.
118///
119/// [`io::BufRead`]: std::io::BufRead
120///
121/// # Examples
122///
123/// ```
124/// use std::io::Cursor;
125/// use utf8_bufread::BufRead;
126///
127/// // Prints "I luv you too !"
128/// if Cursor::new("💖").read_str().map_or(false, |s| s == "💖") {
129/// println!("I luv you too !");
130/// }
131/// ```
132pub trait BufRead: io::BufRead {
133 /// Reads some bytes from the inner reader and returns a [`Cow`]`<&`[`str`]`>` of it referring
134 /// to all valid codepoints read, wrapped in an [`io::Result`].
135 ///
136 /// This function will read all bytes from the underlying stream until its buffer is full, an
137 /// invalid or incomplete codepoint is found, or EOF is found. Once found, all codepoints up
138 /// to, including the EOF (if found), but not including the invalid or incomplete codepoint
/// (if found), will be returned. This function may read an arbitrary number of bytes,
/// between 1 and this reader's buffer capacity (unless the buffer is not big enough to fit a
/// unicode codepoint).
142 ///
143 /// The returned reference points to this reader's actual buffer, meaning it borrows the
144 /// reader.
145 ///
146 /// A [`Cow`] is used to gracefully handle cases where a valid codepoint is cut by the end of
147 /// the buffer of this reader, and more bytes may need to be read from the inner reader to form
148 /// a hopefully valid codepoint. This function only allocates a new [`String`] and clones data
149 /// in that scenario. In worst case it happens once every two calls, allocating and cloning
150 /// 4 bytes every `c` bytes read, where `c` is this reader's buffer capacity.
151 ///
152 /// If this function returns [`Ok`]`("")`, the stream has reached EOF.
153 ///
154 /// # Errors
155 ///
/// If the internal call to [`fill_buf`] returns an [`io::Error`], this function immediately
/// returns an [`Error`] wrapping the original error.
158 ///
159 /// If the first codepoint read from the inner reader is invalid, an [`Error`] wrapping the
160 /// original [`Utf8Error`] or [`FromUtf8Error`] is returned.
161 ///
162 /// If the codepoint is complete but invalid, the returned error will have a [`kind`] of
163 /// [`ErrorKind`]`::`[`InvalidData`]. If EOF was encountered before the end of a codepoint,
164 /// the error will have a [`kind`] of [`ErrorKind`]`::`[`UnexpectedEof`].
165 ///
166 /// The returned [`Error`] may contain a non-zero amount of "leftover" bytes (see
167 /// [`Error::leftovers`] for more info). When it is the case, it is guaranteed that the
168 /// following read operation will not return any of those bytes, nor "skip" bytes from this
169 /// reader.
170 ///
171 /// Note that if the buffer of this reader is less than 4 bytes long it may fail to read
172 /// complete codepoints and "spuriously" return the same error as when it unexpectedly
173 /// encounters EOF, since we're unable to load enough bytes to form a valid codepoint. We
174 /// cannot check the capacity of the buffer using the [`io::BufRead`] API only, it is then up
175 /// to the user to ensure this won't happen. *It should not happen unless you explicitly set
176 /// the capacity yourself*.
177 ///
178 /// # Examples
179 ///
180 /// This example simply reads from a stream and prints it to standard output.
181 ///
182 /// ```
183 /// use std::io::{Cursor, Error, ErrorKind};
184 /// use utf8_bufread::BufRead;
185 /// use std::borrow::Cow;
186 ///
187 /// // We could use any type implementing io::BufRead, we'll just use a cursor here
188 /// let mut reader = Cursor::new("Löwe 老虎 Léopard");
189 ///
190 /// loop {
191 /// match reader.read_str() {
192 /// Ok(s) => {
193 /// if s.is_empty() {
194 /// break; // EOF
195 /// }
196 /// print!("{}", s)
197 /// }
198 /// Err(e) => {
199 /// if ErrorKind::Interrupted != e.kind() {
200 /// // Ignore interrupted errors
201 /// eprintln!("{}", e);
202 /// }
203 /// }
204 /// }
205 /// }
206 /// ```
207 ///
208 /// [`kind`]: self::Error::kind
209 /// [`fill_buf`]: std::io::BufRead::fill_buf
210 /// [`Interrupted`]: std::io::ErrorKind::Interrupted
211 /// [`InvalidData`]: std::io::ErrorKind::InvalidData
212 /// [`UnexpectedEof`]: std::io::ErrorKind::UnexpectedEof
213 /// [`Utf8Error`]: std::str::Utf8Error
214 /// [`FromUtf8Error`]: std::string::FromUtf8Error
215 fn read_str(&mut self) -> Result<Cow<str>> {
216 // Fill the buffer from inner reader's data and get its content
217 let read_bytes = self.fill_buf()?;
218 let read_len = read_bytes.len();
219 if read_len == 0 {
220 return Ok(Cow::from(""));
221 }
222 let ptr = read_bytes.as_ptr();
223 // We attempt converting read bytes to utf8
224 match from_utf8(read_bytes) {
225 Ok(_) => {
226 self.consume(read_len);
227 // The call to `from_raw_parts` is safe, as:
228 // a. It is within the memory region of the reader's now filled buffer.
229 // b. Implicit lifetimes imply the reader is mutably borrowed for the lifetime of the
230 // returned str reference
231 // TODO: ask for review of point b. above
232 // The call to `from_utf8_unchecked` is safe as we just ran the validation on the same
233 // memory region above
234 Ok(Cow::from(unsafe {
235 from_utf8_unchecked(from_raw_parts(ptr, read_len))
236 }))
237 }
238 Err(e) => {
239 // If we have an error, we will first attempt to return all valid read bytes,
240 // putting the invalid or incomplete codepoint at the beginning of the buffer.
241 // This allows us to recover from reading up to a byte that isn't on a char
242 // boundary by reading the complete codepoint on the next call
243 let len = e.valid_up_to();
244 if len != 0 {
245 self.consume(len);
246 // This is safe, see `Utf8Error::valid_up_to(&self)` doc
247 Ok(Cow::from(unsafe {
248 from_utf8_unchecked(from_raw_parts(ptr, len))
249 }))
250 } else if read_len >= codepoint_length(read_bytes[0]) {
251 // If we cannot decode any valid utf8 byte from the buffer, it either means
252 // - We reached EOF with an incomplete codepoint, we should return an
253 // UnexpectedEof Error
254 // - There was a parse error earlier, and we read everything up to this
255 // point in a previous read call, there is two possible situations again:
256 // - There is more than 2 bytes following the first byte of the invalid
257 // slice, this means there truly is an invalid codepoint, we should
258 // return an Utf8Error
259 // - There is less than 4 bytes left in the buffer, meaning we may have
260 // an incomplete codepoint and need to read up to 3 bytes further.
261 // We know read_bytes is not empty
262 // We couldn't get a valid codepoint despite reading enough bytes
263 Err(Error::from(e))
264 } else {
265 // Not enough bytes read, we will try to read more bytes
266 // Consume the last bytes, so that the next call to `fill_buff` will read
267 // more bytes from the underlying stream
268 self.consume(read_len);
269 read_across_boundary(self, Vec::from(unsafe { from_raw_parts(ptr, read_len) }))
270 }
271 }
272 }
273 }
274
275 /// Reads 1 to 4 bytes from the inner reader and returns a [`Cow`]`<&`[`str`]`>` of it
276 /// referring to the valid codepoints read, wrapped in an [`io::Result`].
277 ///
278 /// This function will read bytes from the underlying stream until one codepoint is read, an
279 /// invalid or incomplete codepoint is found, or EOF is found.
280 ///
281 /// The returned reference points to this reader's actual buffer, meaning it borrows the
282 /// reader.
283 ///
284 /// A [`Cow`] is used to gracefully handle cases where a valid codepoint is cut by the end of
285 /// the buffer of this reader, and more bytes may need to be read from the inner reader to form
286 /// a hopefully valid codepoint. This function only allocates a new [`String`] and clones data
287 /// in that scenario. In worst case it allocates and clones 4 bytes every `c` bytes read,
288 /// where `c` is this reader's buffer capacity.
289 ///
290 /// If this function returns [`Ok`]`("")`, the stream has reached EOF.
291 ///
292 /// # Errors
293 ///
/// If the internal call to [`fill_buf`] returns an [`io::Error`], this function immediately
/// returns an [`Error`] wrapping the original error.
296 ///
297 /// If the first codepoint read from the inner reader is invalid or incomplete, an [`Error`]
298 /// wrapping the original [`Utf8Error`] or [`FromUtf8Error`] is returned.
299 ///
300 /// If the codepoint is complete but invalid, the returned error will have a [`kind`] of
301 /// [`ErrorKind`]`::`[`InvalidData`]. If EOF was encountered before the end of a codepoint,
302 /// the error will have a [`kind`] of [`ErrorKind`]`::`[`UnexpectedEof`].
303 ///
304 /// The returned [`Error`] may contain a non-zero amount of "leftover" bytes (see
305 /// [`Error::leftovers`] for more info). When it is the case, it is guaranteed that the
306 /// following read operation will not return any of those bytes, nor "skip" bytes from this
307 /// reader.
308 ///
309 /// Note that if the buffer of this reader is less than 4 bytes long it may fail to read
310 /// complete codepoints and "spuriously" return the same error as when it unexpectedly
311 /// encounters EOF, since we're unable to load enough bytes to form a valid codepoint. We
312 /// cannot check the capacity of the buffer using the [`io::BufRead`] API only, it is then up
313 /// to the user to ensure this won't happen. *It should not happen unless you explicitly set
314 /// the capacity yourself*.
315 ///
316 /// # Examples
317 ///
318 /// This example simply reads from a stream and counts the number of `🏳` character.
319 ///
320 /// ```
321 /// use std::io::{Cursor, Error, ErrorKind};
322 /// use utf8_bufread::BufRead;
323 /// use std::borrow::Cow;
324 ///
325 /// // We could use any type implementing io::BufRead, we'll just use a cursor here
326 /// let mut reader = Cursor::new("Löwe 老虎 🏳Léopard");
327 /// let mut count = 0;
328 ///
329 /// loop {
330 /// match reader.read_codepoint() {
331 /// Ok(s) => {
332 /// if s.is_empty() {
333 /// break; // EOF
334 /// }
335 /// if s == "🏳" {
336 /// count += 1;
337 /// }
338 /// }
339 /// Err(e) => {
340 /// if ErrorKind::Interrupted != e.kind() {
341 /// // Ignore interrupted errors
342 /// eprintln!("{}", e);
343 /// }
344 /// }
345 /// }
346 /// }
347 /// assert_eq!(count, 1);
348 /// ```
349 #[doc(hidden)]
350 fn read_codepoint(&mut self) -> Result<Cow<str>> {
351 // Fill the buffer from inner reader's data and get its content
352 let read_bytes = self.fill_buf()?;
353 let read_len = read_bytes.len();
354 if read_len == 0 {
355 return Ok(Cow::from(""));
356 }
357 let ptr = read_bytes.as_ptr();
358 let len = codepoint_length(read_bytes[0]);
359 if read_len < len {
360 // Not enough bytes read, we will try to read more bytes
361 // Consume the last bytes, so that the next call to `fill_buff` will read
362 // more bytes from the underlying stream
363 self.consume(read_len);
364 read_across_boundary(self, Vec::from(unsafe { from_raw_parts(ptr, read_len) }))
365 } else {
366 match from_utf8(&read_bytes[..len]) {
367 Ok(_) => {
368 self.consume(len);
369 // The call to `from_raw_parts` is safe, as:
370 // a. It is within the memory region of the reader's now filled buffer.
371 // b. Implicit lifetimes imply the reader is mutably borrowed for the lifetime of the
372 // returned str reference
373 // TODO: ask for review of point b. above
374 // The call to `from_utf8_unchecked` is safe as we just ran the validation on the same
375 // memory region above
376 Ok(Cow::from(unsafe {
377 from_utf8_unchecked(from_raw_parts(ptr, len))
378 }))
379 }
380 Err(e) => Err(Error::from(e)),
381 }
382 }
383 }
384
385 /// Reads 1 to 4 bytes from the inner reader and returns the [`char`] read, wrapped in an
386 /// [`io::Result`].
387 ///
388 /// This function will read bytes from the underlying stream until one codepoint is read, an
389 /// invalid or incomplete codepoint is found, or EOF is found.
390 ///
391 /// If this function returns [`Ok`]`('\0')`, the stream has reached EOF.
392 ///
393 /// # Errors
394 ///
/// If the internal call to [`fill_buf`] returns an [`io::Error`], this function immediately
/// returns an [`Error`] wrapping the original error.
397 ///
398 /// If the first codepoint read from the inner reader is invalid or incomplete, an [`Error`]
399 /// wrapping the original [`Utf8Error`] or [`FromUtf8Error`] is returned.
400 ///
401 /// If the codepoint is complete but invalid, the returned error will have a [`kind`] of
402 /// [`ErrorKind`]`::`[`InvalidData`]. If EOF was encountered before the end of a codepoint,
403 /// the error will have a [`kind`] of [`ErrorKind`]`::`[`UnexpectedEof`].
404 ///
405 /// The returned [`Error`] may contain a non-zero amount of "leftover" bytes (see
406 /// [`Error::leftovers`] for more info). When it is the case, it is guaranteed that the
407 /// following read operation will not return any of those bytes, nor "skip" bytes from this
408 /// reader.
409 ///
410 /// Note that if the buffer of this reader is less than 4 bytes long it may fail to read
411 /// complete codepoints and "spuriously" return the same error as when it unexpectedly
412 /// encounters EOF, since we're unable to load enough bytes to form a valid codepoint. We
413 /// cannot check the capacity of the buffer using the [`io::BufRead`] API only, it is then up
414 /// to the user to ensure this won't happen. *It should not happen unless you explicitly set
415 /// the capacity yourself*.
416 ///
417 /// # Examples
418 ///
419 /// This example simply reads from a stream and counts the number of lowercase characters
420 ///
421 /// ```
422 /// use std::io::{Cursor, Error, ErrorKind};
423 /// use utf8_bufread::BufRead;
424 /// use std::borrow::Cow;
425 ///
426 /// // We could use any type implementing io::BufRead, we'll just use a cursor here
427 /// let mut reader = Cursor::new("Löwe 老虎 Léopard");
428 /// let mut count = 0;
429 ///
430 /// loop {
431 /// match reader.read_char() {
432 /// Ok('\0') => break, // EOF
433 /// Ok(c) => {
434 /// if c.is_lowercase() {
435 /// count += 1;
436 /// }
437 /// }
438 /// Err(e) => {
439 /// if ErrorKind::Interrupted != e.kind() {
440 /// // Ignore interrupted errors
441 /// eprintln!("{}", e);
442 /// }
443 /// }
444 /// }
445 /// }
446 /// assert_eq!(count, 9);
447 /// ```
448 ///
449 /// [`fill_buf`]: std::io::BufRead::fill_buf
450 /// [`Interrupted`]: std::io::ErrorKind::Interrupted
451 /// [`InvalidData`]: std::io::ErrorKind::InvalidData
452 /// [`UnexpectedEof`]: std::io::ErrorKind::UnexpectedEof
453 /// [`Utf8Error`]: std::str::Utf8Error
454 /// [`FromUtf8Error`]: std::string::FromUtf8Error
455 /// [`kind`]: crate::error::Error::kind
456 fn read_char(&mut self) -> Result<char> {
457 // We guarantee that self.read_codepoint returns:
458 // - An empty string or
459 // - Exactly one valid codepoint
460 let c = self.read_codepoint()?;
461 if c.is_empty() {
462 return Ok('\0');
463 }
464 Ok(unsafe { char::from_str(c.as_ref()).unwrap_unchecked() })
465 }
466
467 /// Returns an iterator over string slices of this reader.
468 ///
469 /// It is equivalent to calling [`read_str`] in a loop, ignoring
470 /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
471 ///
472 /// The iterator returned by this function will yield instances of
473 /// [`io::Result`]`<`[`Rc`]`<`[`String`]`>>`. We use the [`Rc`] to check while iterating if the
474 /// iterator is the only one holding a reference to it, avoiding allocating a new buffer if
475 /// that's the case.
476 ///
477 /// The iterator returned will yield at most one [`io::Error`]. Once an error is yielded, it
478 /// will only yield [`None`].
479 ///
480 /// # Examples
481 ///
482 /// This example simply reads from a string and prints it to standard output:
483 ///
484 /// ```
485 /// use std::io::Cursor;
486 /// use utf8_bufread::BufRead;
487 ///
488 /// // We could use any type implementing io::BufRead, we'll just use a cursor here
489 /// let mut reader = Cursor::new("Löwe 老虎 Léopard");
490 /// // We ignore any error, we know once we encounter one we can't read any further anyway
491 /// reader.str_iter().filter_map(Result::ok).for_each(|s| print!("{}", s));
492 /// ```
493 ///
494 /// [`read_str`]: self::BufRead::read_str
495 /// [`Interrupted`]: std::io::ErrorKind::Interrupted
496 fn str_iter(&mut self) -> StrIter<'_, Self> {
497 let default_cap = 8 * 1024;
498 StrIter {
499 reader: self,
500 buf: Rc::new(String::with_capacity(default_cap)),
501 default_cap,
502 ended: false,
503 }
504 }
505
506 /// Returns an iterator over codepoints of this reader.
507 ///
508 /// It is equivalent to calling [`read_codepoint`] in a loop, ignoring
509 /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
510 ///
511 /// The iterator returned by this function will yield instances of
512 /// [`io::Result`]`<`[`Rc`]`<`[`String`]`>>`. We use the [`Rc`] to check while iterating if the
513 /// iterator is the only one holding a reference to it, avoiding allocating a new buffer if
514 /// that's the case.
515 ///
516 /// The iterator returned will yield at most one [`io::Error`]. Once an error is yielded, it
517 /// will only yield [`None`].
518 ///
519 /// # Examples
520 ///
521 /// This example simply reads from a stream and counts the number of `🏳` character.
522 ///
523 /// ```
524 /// use std::io::Cursor;
525 /// use utf8_bufread::BufRead;
526 ///
527 /// // We could use any type implementing io::BufRead, we'll just use a cursor here
528 /// let mut reader = Cursor::new("Löwe 老虎 🏳Léopard");
529 /// let count = reader.codepoint_iter()
530 /// .filter_map(Result::ok)
531 /// .filter(|s| s.as_ref() == "🏳")
532 /// .count();
533 /// assert_eq!(count, 1);
534 /// ```
535 #[doc(hidden)]
536 fn codepoint_iter(&mut self) -> CodepointIter<'_, Self> {
537 let default_cap = 4;
538 CodepointIter {
539 reader: self,
540 buf: Rc::new(String::with_capacity(default_cap)),
541 default_cap,
542 ended: false,
543 }
544 }
545
546 /// Returns an iterator over chars of this reader.
547 ///
548 /// It is equivalent to calling [`read_char`] in a loop, ignoring
549 /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
550 ///
551 /// The iterator returned by this function will yield instances of
552 /// [`io::Result`]`<`[`char`]`>`.
553 ///
554 /// The iterator returned will yield at most one [`io::Error`]. Once an error is yielded, it
555 /// will only yield [`None`].
556 ///
557 /// # Examples
558 ///
559 /// This example simply reads from a stream, filtering out any whitespace:
560 ///
561 /// ```
562 /// use std::io::Cursor;
563 /// use utf8_bufread::BufRead;
564 ///
565 /// // We could use any type implementing io::BufRead, we'll just use a cursor here
566 /// let mut reader = Cursor::new("Löwe 老虎 Léopard");
567 /// let result: String = reader.char_iter()
568 /// .filter_map(Result::ok)
569 /// .filter(|c| !c.is_whitespace())
570 /// .collect();
571 /// assert_eq!(result.as_str(), "Löwe老虎Léopard");
572 /// ```
573 ///
574 /// [`read_char`]: self::BufRead::read_char
575 /// [`Interrupted`]: std::io::ErrorKind::Interrupted
576 fn char_iter(&mut self) -> CharIter<'_, Self> {
577 CharIter {
578 reader: self,
579 ended: false,
580 }
581 }
582
/// Returns a mapping iterator over string slices of this reader.
584 ///
585 /// It is equivalent to calling [`read_str`] in a loop, ignoring
586 /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
587 ///
/// The iterator returned by this function will call `f` with instances of [`Cow`]`<`[`str`]`>`
/// as returned by [`read_str`], and yield instances of [`io::Result`]`<T>`. This may help
/// avoid the allocations and cloning that [`str_iter`] performs.
591 ///
592 /// The iterator returned will yield at most one [`io::Error`], and if one is yielded it will
593 /// always be the last item.
594 ///
595 /// # Examples
596 ///
597 /// This example simply reads from a stream and counts the number of bytes read:
598 ///
599 /// ```
600 /// use std::io::Cursor;
601 /// use utf8_bufread::BufRead;
602 ///
603 /// // We could use any type implementing io::BufRead, we'll just use a cursor here
604 /// let mut reader = Cursor::new("Löwe 老虎 Léopard");
605 /// let count: usize = reader.str_map(|s| s.len()).filter_map(Result::ok).sum();
606 /// assert_eq!(count, 21);
607 /// ```
608 ///
609 /// [`read_str`]: self::BufRead::read_str
610 /// [`str_iter`]: self::BufRead::str_iter
611 /// [`Interrupted`]: std::io::ErrorKind::Interrupted
612 fn str_map<F, T>(&mut self, f: F) -> StrMap<'_, Self, F>
613 where
614 F: FnMut(Cow<str>) -> T,
615 {
616 StrMap {
617 reader: self,
618 map: Rc::new(f),
619 ended: false,
620 }
621 }
622
/// Returns a mapping iterator over codepoints of this reader.
624 ///
625 /// It is equivalent to calling [`read_codepoint`] in a loop, ignoring
626 /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
627 ///
/// The iterator returned by this function will call `f` with instances of [`Cow`]`<`[`str`]`>`
/// as returned by `read_codepoint`, and yield instances of [`io::Result`]`<T>`. This may help
/// avoid the allocations and cloning that `codepoint_iter` performs.
631 ///
632 /// The iterator returned will yield at most one [`io::Error`], and if one is yielded it will
633 /// always be the last item.
634 ///
635 /// # Examples
636 ///
/// This example simply maps each codepoint to its length in bytes:
638 ///
639 /// ```
640 /// use std::io::Cursor;
641 /// use utf8_bufread::BufRead;
642 ///
643 /// // We could use any type implementing io::BufRead, we'll just use a cursor here
644 /// let mut reader = Cursor::new("Löwe 老虎 Léopard");
645 /// let lengths: Vec<_> = reader.codepoint_map(|s| s.len()).filter_map(Result::ok).collect();
646 /// assert_eq!(lengths.as_ref(), [1, 2, 1, 1, 1, 3, 3, 1, 1, 2, 1, 1, 1, 1, 1]);
647 /// ```
648 #[doc(hidden)]
649 fn codepoint_map<F, T>(&mut self, f: F) -> CodepointMap<'_, Self, F>
650 where
651 F: FnMut(Cow<str>) -> T,
652 {
653 CodepointMap {
654 reader: self,
655 map: Rc::new(f),
656 ended: false,
657 }
658 }
659}
660
661impl<R: io::BufRead> BufRead for R {}
662
/// Iterator yielding the string slices read from an instance of [`io::BufRead`].
///
/// Instances are created by [`str_iter`], see its documentation for more details.
///
/// [`str_iter`]: self::BufRead::str_iter
pub struct StrIter<'r, R: ?Sized> {
    // Reader the string slices are read from
    reader: &'r mut R,
    // Buffer handed out to the caller, re-used when it is not referenced anywhere else
    buf: Rc<String>,
    // Capacity given to `buf` whenever it has to be re-allocated
    default_cap: usize,
    // Set once EOF or an error has been encountered, the iterator then only yields `None`
    ended: bool,
}
676
677impl<R> Iterator for StrIter<'_, R>
678where
679 R: io::BufRead,
680{
681 type Item = Result<Rc<String>>;
682
683 //noinspection DuplicatedCode
684 fn next(&mut self) -> Option<Self::Item> {
685 if self.ended {
686 return None;
687 }
688 let buf = match Rc::get_mut(&mut self.buf) {
689 None => {
690 self.buf = Rc::new(String::with_capacity(self.default_cap));
691 Rc::make_mut(&mut self.buf)
692 }
693 Some(buf) => {
694 buf.clear();
695 buf
696 }
697 };
698 loop {
699 match self.reader.read_str() {
700 Err(e) => {
701 if let ErrorKind::Interrupted = e.kind() {
702 continue;
703 }
704 self.ended = true;
705 break Some(Err(e));
706 }
707 Ok(s) => {
708 if s.is_empty() {
709 self.ended = true;
710 break None;
711 } else {
712 buf.push_str(s.as_ref());
713 break Some(Ok(Rc::clone(&self.buf)));
714 }
715 }
716 }
717 }
718 }
719}
720
/// Iterator yielding the codepoints read from an instance of [`io::BufRead`], each as its own
/// string.
///
/// Instances are created by [`codepoint_iter`], see its documentation for more details.
///
/// [`codepoint_iter`]: self::BufRead::codepoint_iter
#[doc(hidden)]
pub struct CodepointIter<'r, R: ?Sized> {
    // Reader the codepoints are read from
    reader: &'r mut R,
    // Buffer handed out to the caller, re-used when it is not referenced anywhere else
    buf: Rc<String>,
    // Capacity given to `buf` whenever it has to be re-allocated
    default_cap: usize,
    // Set once EOF or an error has been encountered, the iterator then only yields `None`
    ended: bool,
}
735
736impl<R> Iterator for CodepointIter<'_, R>
737where
738 R: io::BufRead,
739{
740 type Item = Result<Rc<String>>;
741
742 //noinspection DuplicatedCode
743 fn next(&mut self) -> Option<Self::Item> {
744 if self.ended {
745 return None;
746 }
747 let buf = match Rc::get_mut(&mut self.buf) {
748 None => {
749 self.buf = Rc::new(String::with_capacity(self.default_cap));
750 Rc::make_mut(&mut self.buf)
751 }
752 Some(buf) => {
753 buf.clear();
754 buf
755 }
756 };
757 loop {
758 match self.reader.read_codepoint() {
759 Err(e) => {
760 if let ErrorKind::Interrupted = e.kind() {
761 continue;
762 }
763 self.ended = true;
764 break Some(Err(e));
765 }
766 Ok(s) => {
767 if s.is_empty() {
768 self.ended = true;
769 break None;
770 } else {
771 buf.push_str(s.as_ref());
772 break Some(Ok(Rc::clone(&self.buf)));
773 }
774 }
775 }
776 }
777 }
778}
779
/// Mapping iterator feeding the string slices read from an instance of [`io::BufRead`] to a
/// function and yielding its results.
///
/// Instances are created by [`str_map`], see its documentation for more details.
///
/// [`str_map`]: self::BufRead::str_map
pub struct StrMap<'r, R: ?Sized, F> {
    // Reader the string slices are read from
    reader: &'r mut R,
    // Mapping function applied to each string slice read
    map: Rc<F>,
    // Set once EOF or an error has been encountered, the iterator then only yields `None`
    ended: bool,
}
792
793impl<R, F, T> Iterator for StrMap<'_, R, F>
794where
795 R: io::BufRead,
796 F: FnMut(Cow<str>) -> T,
797{
798 type Item = Result<T>;
799
800 //noinspection DuplicatedCode
801 fn next(&mut self) -> Option<Self::Item> {
802 if self.ended {
803 return None;
804 }
805 loop {
806 match self.reader.read_str() {
807 Ok(s) => {
808 if s.is_empty() {
809 self.ended = true;
810 break None;
811 } else {
812 break Some(Ok((Rc::get_mut(&mut self.map)
813 .expect("MappingIter's mapping function cannot be shared !"))(
814 s
815 )));
816 }
817 }
818 Err(e) => {
819 if let ErrorKind::Interrupted = e.kind() {
820 continue;
821 }
822 self.ended = true;
823 break Some(Err(e));
824 }
825 }
826 }
827 }
828}
829
/// Mapping iterator feeding the codepoints read from an instance of [`io::BufRead`] to a
/// function and yielding its results.
///
/// Instances are created by [`codepoint_map`], see its documentation for more details.
///
/// [`codepoint_map`]: self::BufRead::codepoint_map
#[doc(hidden)]
pub struct CodepointMap<'r, R: ?Sized, F> {
    // Reader the codepoints are read from
    reader: &'r mut R,
    // Mapping function applied to each codepoint read
    map: Rc<F>,
    // Set once EOF or an error has been encountered, the iterator then only yields `None`
    ended: bool,
}
841
842impl<R, F, T> Iterator for CodepointMap<'_, R, F>
843where
844 R: io::BufRead,
845 F: FnMut(Cow<str>) -> T,
846{
847 type Item = Result<T>;
848
849 //noinspection DuplicatedCode
850 fn next(&mut self) -> Option<Self::Item> {
851 if self.ended {
852 return None;
853 }
854 loop {
855 match self.reader.read_codepoint() {
856 Ok(s) => {
857 if s.is_empty() {
858 self.ended = true;
859 break None;
860 } else {
861 break Some(Ok((Rc::get_mut(&mut self.map)
862 .expect("MappingIter's mapping function cannot be shared !"))(
863 s
864 )));
865 }
866 }
867 Err(e) => {
868 if let ErrorKind::Interrupted = e.kind() {
869 continue;
870 }
871 self.ended = true;
872 break Some(Err(e));
873 }
874 }
875 }
876 }
877}
878
/// Iterator yielding the [`char`]s read from an instance of [`io::BufRead`].
///
/// Instances are created by [`char_iter`], see its documentation for more details.
///
/// [`char_iter`]: self::BufRead::char_iter
pub struct CharIter<'r, R: ?Sized> {
    // Reader the chars are read from
    reader: &'r mut R,
    // Set once EOF or an error has been encountered, the iterator then only yields `None`
    ended: bool,
}
890
891impl<R> Iterator for CharIter<'_, R>
892where
893 R: io::BufRead,
894{
895 type Item = Result<char>;
896
897 fn next(&mut self) -> Option<Self::Item> {
898 if self.ended {
899 return None;
900 }
901 match self.reader.read_char() {
902 Ok(c) => {
903 if c == '\0' {
904 self.ended = true;
905 None
906 } else {
907 Some(Ok(c))
908 }
909 }
910 Err(e) => {
911 self.ended = true;
912 Some(Err(e))
913 }
914 }
915 }
916}
917
918fn read_across_boundary<R>(reader: &mut R, mut leftovers: Vec<u8>) -> Result<Cow<str>>
919where
920 R: io::BufRead + ?Sized,
921{
922 debug_assert!(!leftovers.is_empty());
923 // We know leftovers is not empty
924 let len = codepoint_length(leftovers[0]);
925 let first_read_len = leftovers.len();
926 debug_assert!(len > first_read_len);
927 let additional_len = (len - first_read_len) as usize;
928 // Let's try reading more bytes
929 let additional_bytes = &reader.fill_buf()?;
930 if additional_bytes.len() < additional_len {
931 // Not enough additional bytes, we reached EOF on an incomplete codepoint
932 return Err(Error::from(ErrorKind::UnexpectedEof).with_leftovers(leftovers));
933 }
934 // we know we have enough data
935 leftovers.extend_from_slice(&additional_bytes[..additional_len]);
936 reader.consume(additional_len);
937 match String::from_utf8(leftovers) {
938 Ok(s) => Ok(Cow::from(s)),
939 // We read enough bytes, they simply were not valid
940 Err(e) => Err(Error::from(e)),
941 }
942}
943
/// Returns the number of bytes a UTF-8 sequence is expected to span, judging only by its first
/// byte `x`.
///
/// No validation is performed: every byte maps to a length between 1 and 4. In particular bytes
/// in `0x80..=0xBF` map to 2 and bytes in `0xF8..=0xFF` map to 4.
#[inline]
fn codepoint_length(x: u8) -> usize {
    match x {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}
956
/// Tests of `read_str` on readers whose buffer holds the whole input (`Cursor`) and on readers
/// whose buffer is smaller than a codepoint (`BufReader` with a capacity of 2).
#[cfg(test)]
mod read_str_tests {
    use crate::BufRead;
    use std::io::{BufReader, Cursor, ErrorKind};
    use std::str::Utf8Error;
    use std::string::FromUtf8Error;

    /// Reading an empty input succeeds and returns an empty string.
    #[test]
    fn empty_read() {
        let mut r = Cursor::new("");
        let s = r.read_str();
        assert!(s.is_ok());
        let s = s.unwrap();
        assert!(s.is_empty());
    }

    /// Input starting with a continuation byte (`0x9f`) fails with `InvalidData` wrapping a
    /// `Utf8Error`.
    #[test]
    fn invalid_in_buffer() {
        let mut r = Cursor::new([0x9fu8, 0x92, 0x96, 0x0]);
        let e = r.read_str();
        assert!(e.is_err());
        let e = e.unwrap_err();
        assert_eq!(e.kind(), ErrorKind::InvalidData);
        let e = e.into_inner_checked();
        assert!(e.is_ok());
        let e = e.unwrap();
        assert!(e.is_some());
        let e = e.unwrap();
        assert!(e.is::<Utf8Error>());
    }

    /// Input made of the first 3 bytes of a 4-byte codepoint fails with `UnexpectedEof`, keeps
    /// the read bytes as leftovers and wraps no inner error.
    #[test]
    fn incomplete_in_buffer() {
        let mut r = Cursor::new(&"💖".as_bytes()[..3]);
        let e = r.read_str();
        assert!(e.is_err());
        let e = e.unwrap_err();
        assert_eq!(e.kind(), ErrorKind::UnexpectedEof);
        assert!(!e.leftovers().is_empty());
        let e = e.into_inner_lossy();
        assert!(e.is_none());
    }

    /// An invalid sequence split over two buffer fills fails with `InvalidData`, keeps the read
    /// bytes as leftovers and wraps a `FromUtf8Error`.
    #[test]
    fn invalid_across_boundary() {
        let mut r = BufReader::<&[u8]>::with_capacity(2, [0xffu8, 0x92, 0x96, 0x0].as_ref());
        let e = r.read_str();
        assert!(e.is_err());
        let e = e.unwrap_err();
        assert_eq!(e.kind(), ErrorKind::InvalidData);
        assert!(!e.leftovers().is_empty());
        let e = e.into_inner_lossy();
        assert!(e.is_some());
        let e = e.unwrap();
        assert!(e.is::<FromUtf8Error>());
    }

    /// A truncated codepoint split over two buffer fills fails with `UnexpectedEof`, keeps the
    /// read bytes as leftovers and wraps no inner error.
    #[test]
    fn incomplete_across_boundary() {
        let mut r = BufReader::<&[u8]>::with_capacity(2, &"💖".as_bytes()[..3]);
        let e = r.read_str();
        assert!(e.is_err());
        let e = e.unwrap_err();
        assert_eq!(e.kind(), ErrorKind::UnexpectedEof);
        // Same expectation as `incomplete_in_buffer` and the `buf_too_small_tests` failures:
        // the bytes of the truncated codepoint must be handed back with the error
        assert!(!e.leftovers().is_empty());
        let e = e.into_inner_lossy();
        assert!(e.is_none());
    }

    /// A single complete codepoint is returned as is.
    #[test]
    fn complete_successful_read() {
        let mut r = Cursor::new("💖");
        let s = r.read_str();
        assert!(s.is_ok());
        let s = s.unwrap();
        assert_eq!(s, "💖");
    }

    /// When valid text is followed by invalid bytes, the first read returns the valid prefix.
    #[test]
    fn incomplete_successful_read() {
        let mut r = Cursor::new([0x6fu8, 0xa, 0x9f, 0x92, 0x96, 0x0]);
        let s = r.read_str();
        assert!(s.is_ok());
        let s = s.unwrap();
        assert_eq!(s, "o\n");
    }

    /// A 4-byte codepoint is read in one call through a reader whose buffer holds 2 bytes.
    #[test]
    fn read_across_boundary() {
        let mut r = BufReader::<&[u8]>::with_capacity(2, "💖".as_ref());
        let s = r.read_str();
        assert!(s.is_ok());
        let s = s.unwrap();
        assert_eq!(s, "💖");
    }

    /// Text mixing codepoints of 1 to 4 bytes is returned in one read, the next read reports EOF
    /// with an empty string.
    #[test]
    fn multi_codepoints_read() {
        let mut r = Cursor::new("foo💖bär€");
        let s = r.read_str();
        assert!(s.is_ok());
        let s = s.unwrap();
        assert_eq!(s, "foo💖bär€");
        let s = r.read_str();
        assert!(s.is_ok());
        let s = s.unwrap();
        assert_eq!(s, "");
    }
}
1066
/// Tests of `read_str` through `BufReader`s whose capacity (1 to 3 bytes) is smaller than the
/// longest codepoint. Test names state the byte length of the codepoint and the number of
/// single-byte chars preceding it in the input.
#[cfg(test)]
mod buf_too_small_tests {
    /// Generates a test named `$name` reading `$input` through a `BufReader` of capacity `$cap`.
    ///
    /// - `success`: every read until EOF must succeed, and at least one must return text.
    /// - `failure`: reads return text until one fails; that failure must be an `UnexpectedEof`
    ///   carrying leftovers and no inner error.
    macro_rules! buf_too_small_test {
        ($name:ident $cap:literal $input:literal: success) => {
            #[test]
            fn $name() {
                let mut r = BufReader::<&[u8]>::with_capacity($cap, $input.as_bytes());
                let mut non_empty_reads = 0;
                // Keep reading until an empty string signals EOF
                loop {
                    let chunk = r.read_str();
                    assert!(chunk.is_ok());
                    if chunk.unwrap().is_empty() {
                        break;
                    }
                    non_empty_reads += 1;
                }
                // EOF must not have been met on the very first call
                assert_ne!(non_empty_reads, 0);
            }
        };
        ($name:ident $cap:literal $input:literal: failure) => {
            #[test]
            fn $name() {
                let mut r = BufReader::<&[u8]>::with_capacity($cap, $input.as_bytes());
                // Keep reading until a read fails
                loop {
                    match r.read_str() {
                        // EOF must not be reached before a read has failed
                        Ok(s) => assert!(!s.is_empty()),
                        Err(e) => {
                            assert_eq!(e.kind(), ErrorKind::UnexpectedEof);
                            assert!(!e.leftovers().is_empty());
                            assert!(e.into_inner_lossy().is_none());
                            break;
                        }
                    }
                }
            }
        };
    }

    mod buf_capacity_1 {
        use crate::BufRead;
        use std::io::{BufReader, ErrorKind};

        buf_too_small_test!(codepoint_length_1_offset_0 1 "f": success);
        buf_too_small_test!(codepoint_length_2_offset_0 1 "ä": success);
        buf_too_small_test!(codepoint_length_3_offset_0 1 "€": failure);
        buf_too_small_test!(codepoint_length_4_offset_0 1 "💖": failure);
    }

    mod buf_capacity_2 {
        use crate::BufRead;
        use std::io::{BufReader, ErrorKind};

        buf_too_small_test!(codepoint_length_1_offset_0 2 "f": success);
        buf_too_small_test!(codepoint_length_2_offset_0 2 "ä": success);
        buf_too_small_test!(codepoint_length_2_offset_1 2 "xä": success);
        buf_too_small_test!(codepoint_length_3_offset_0 2 "€": success);
        buf_too_small_test!(codepoint_length_3_offset_1 2 "x€": success);
        buf_too_small_test!(codepoint_length_4_offset_0 2 "💖": success);
        buf_too_small_test!(codepoint_length_4_offset_1 2 "x💖": failure);
    }

    mod buf_capacity_3 {
        use crate::BufRead;
        use std::io::BufReader;

        buf_too_small_test!(codepoint_length_1_offset_0 3 "f": success);
        buf_too_small_test!(codepoint_length_2_offset_0 3 "ä": success);
        buf_too_small_test!(codepoint_length_2_offset_1 3 "xä": success);
        buf_too_small_test!(codepoint_length_3_offset_0 3 "€": success);
        buf_too_small_test!(codepoint_length_3_offset_1 3 "x€": success);
        buf_too_small_test!(codepoint_length_3_offset_2 3 "xx€": success);
        buf_too_small_test!(codepoint_length_4_offset_0 3 "💖": success);
        buf_too_small_test!(codepoint_length_4_offset_1 3 "x💖": success);
        buf_too_small_test!(codepoint_length_4_offset_2 3 "xx💖": success);
    }
}