async_read_super_ext/
lib.rs

1mod utf8_boundaries;
2
3use tokio::io::AsyncBufRead;
4
5use self::utf8_boundaries::{Utf8BoundariesLossy, read_utf8_boundaries_lossy};
6
7pub trait AsyncReadSuperExt: AsyncBufRead {
8    /// Reads data from the async reader while respecting UTF-8 character boundaries.
9    ///
10    /// This method reads data from the underlying async reader and ensures that the output
11    /// buffer contains only valid UTF-8 sequences. Any invalid UTF-8 bytes are replaced
12    /// with Unicode replacement characters (`U+FFFD`).
13    ///
14    /// # Features
15    ///
16    /// - **UTF-8 Boundary Awareness**: Handles incomplete UTF-8 sequences that span across
17    ///   multiple read operations by buffering partial characters.
18    /// - **Lossy Conversion**: Invalid UTF-8 bytes are replaced with replacement characters
19    ///   rather than causing errors.
20    /// - **Efficient Processing**: Valid UTF-8 data is processed without additional copying
21    ///   when possible.
22    ///
23    /// # Arguments
24    ///
25    /// * `buf` - A mutable reference to a `Vec<u8>` where the valid UTF-8 data will be written.
26    ///   The buffer will be extended with new data, not replaced.
27    ///
28    /// # Returns
29    ///
30    /// Returns a future that resolves to `io::Result<usize>` where the `usize` indicates
31    /// the number of bytes written to the buffer. A return value of `0` indicates EOF.
32    ///
33    /// # Behavior with Invalid UTF-8
34    ///
35    /// - **Invalid sequences**: Each invalid byte is replaced with a UTF-8 replacement
36    ///   character (`�`), which is 3 bytes in UTF-8 encoding.
37    /// - **Incomplete sequences**: If an incomplete UTF-8 sequence is encountered at the
38    ///   end of available data, the method will buffer it and attempt to complete it
39    ///   on the next read. If EOF is reached with an incomplete sequence, each byte
40    ///   of the incomplete sequence is replaced with a replacement character.
41    ///
42    /// # Examples
43    ///
44    /// ## Reading valid UTF-8 data
45    ///
46    /// ```
47    /// use async_read_super_ext::AsyncReadSuperExt;
48    /// use tokio::io::{BufReader, Cursor};
49    ///
50    /// # #[tokio::main]
51    /// # async fn main() -> std::io::Result<()> {
52    /// let data = "Hello, 🦀 World!";
53    /// let mut reader = BufReader::new(Cursor::new(data.as_bytes()));
54    /// let mut output = Vec::new();
55    ///
56    /// let bytes_read = reader.read_utf8_boundaries_lossy(&mut output).await?;
57    ///
58    /// assert_eq!(bytes_read, data.len());
59    /// assert_eq!(String::from_utf8(output).unwrap(), data);
60    /// # Ok(())
61    /// # }
62    /// ```
63    ///
64    /// ## Handling invalid UTF-8 bytes
65    ///
66    /// ```
67    /// use async_read_super_ext::AsyncReadSuperExt;
68    /// use tokio::io::{BufReader, Cursor};
69    ///
70    /// # #[tokio::main]
71    /// # async fn main() -> std::io::Result<()> {
72    /// // Create data with invalid UTF-8 bytes
73    /// let mut data = Vec::new();
74    /// data.extend_from_slice("Hello ".as_bytes());
75    /// data.push(0xFF); // Invalid UTF-8 byte
76    /// data.push(0xFE); // Invalid UTF-8 byte
77    /// data.extend_from_slice(" World".as_bytes());
78    ///
79    /// let mut reader = BufReader::new(Cursor::new(data));
80    /// let mut output = Vec::new();
81    ///
82    /// let bytes_read = reader.read_utf8_boundaries_lossy(&mut output).await?;
83    ///
84    /// let result = String::from_utf8(output).unwrap();
85    /// assert!(result.contains("Hello "));
86    /// assert!(result.contains(" World"));
87    /// assert!(result.contains('\u{FFFD}')); // Replacement character
88    ///
89    /// // Count replacement characters (should be 2 for the 2 invalid bytes)
90    /// let replacement_count = result.chars().filter(|&c| c == '\u{FFFD}').count();
91    /// assert_eq!(replacement_count, 2);
92    /// # Ok(())
93    /// # }
94    /// ```
95    ///
96    /// ## Reading from a stream until EOF
97    ///
98    /// ```
99    /// use async_read_super_ext::AsyncReadSuperExt;
100    /// use tokio::io::{BufReader, Cursor};
101    ///
102    /// # #[tokio::main]
103    /// # async fn main() -> std::io::Result<()> {
104    /// let data = "Line 1\nLine 2\nLine 3";
105    /// let mut reader = BufReader::new(Cursor::new(data.as_bytes()));
106    /// let mut all_data = Vec::new();
107    /// let mut buffer = Vec::new();
108    ///
109    /// loop {
110    ///     buffer.clear();
111    ///     let bytes_read = reader.read_utf8_boundaries_lossy(&mut buffer).await?;
112    ///     
113    ///     if bytes_read == 0 {
114    ///         break; // EOF reached
115    ///     }
116    ///     
117    ///     all_data.extend_from_slice(&buffer);
118    /// }
119    ///
120    /// let result = String::from_utf8(all_data).unwrap();
121    /// assert_eq!(result, data);
122    /// # Ok(())
123    /// # }
124    /// ```
125    ///
126    /// # Errors
127    ///
128    /// This method will return an error if the underlying reader encounters an I/O error.
129    /// Invalid UTF-8 sequences do not cause errors; they are handled by replacement.
130    fn read_utf8_boundaries_lossy<'a>(&'a mut self, buf: &'a mut Vec<u8>) -> Utf8BoundariesLossy<'a, Self>
131    where
132        Self: Unpin,
133    {
134        read_utf8_boundaries_lossy(self, buf)
135    }
136}
137
138impl<R: AsyncBufRead + ?Sized> AsyncReadSuperExt for R {}