async_read_super_ext/lib.rs
1mod utf8_boundaries;
2
3use tokio::io::AsyncBufRead;
4
5use self::utf8_boundaries::{Utf8BoundariesLossy, read_utf8_boundaries_lossy};
6
7pub trait AsyncReadSuperExt: AsyncBufRead {
8 /// Reads data from the async reader while respecting UTF-8 character boundaries.
9 ///
10 /// This method reads data from the underlying async reader and ensures that the output
11 /// buffer contains only valid UTF-8 sequences. Any invalid UTF-8 bytes are replaced
12 /// with Unicode replacement characters (`U+FFFD`).
13 ///
14 /// # Features
15 ///
16 /// - **UTF-8 Boundary Awareness**: Handles incomplete UTF-8 sequences that span across
17 /// multiple read operations by buffering partial characters.
18 /// - **Lossy Conversion**: Invalid UTF-8 bytes are replaced with replacement characters
19 /// rather than causing errors.
20 /// - **Efficient Processing**: Valid UTF-8 data is processed without additional copying
21 /// when possible.
22 ///
23 /// # Arguments
24 ///
25 /// * `buf` - A mutable reference to a `Vec<u8>` where the valid UTF-8 data will be written.
26 /// The buffer will be extended with new data, not replaced.
27 ///
28 /// # Returns
29 ///
30 /// Returns a future that resolves to `io::Result<usize>` where the `usize` indicates
31 /// the number of bytes written to the buffer. A return value of `0` indicates EOF.
32 ///
33 /// # Behavior with Invalid UTF-8
34 ///
35 /// - **Invalid sequences**: Each invalid byte is replaced with a UTF-8 replacement
36 /// character (`�`), which is 3 bytes in UTF-8 encoding.
37 /// - **Incomplete sequences**: If an incomplete UTF-8 sequence is encountered at the
38 /// end of available data, the method will buffer it and attempt to complete it
39 /// on the next read. If EOF is reached with an incomplete sequence, each byte
40 /// of the incomplete sequence is replaced with a replacement character.
41 ///
42 /// # Examples
43 ///
44 /// ## Reading valid UTF-8 data
45 ///
46 /// ```
47 /// use async_read_super_ext::AsyncReadSuperExt;
48 /// use tokio::io::{BufReader, Cursor};
49 ///
50 /// # #[tokio::main]
51 /// # async fn main() -> std::io::Result<()> {
52 /// let data = "Hello, 🦀 World!";
53 /// let mut reader = BufReader::new(Cursor::new(data.as_bytes()));
54 /// let mut output = Vec::new();
55 ///
56 /// let bytes_read = reader.read_utf8_boundaries_lossy(&mut output).await?;
57 ///
58 /// assert_eq!(bytes_read, data.len());
59 /// assert_eq!(String::from_utf8(output).unwrap(), data);
60 /// # Ok(())
61 /// # }
62 /// ```
63 ///
64 /// ## Handling invalid UTF-8 bytes
65 ///
66 /// ```
67 /// use async_read_super_ext::AsyncReadSuperExt;
68 /// use tokio::io::{BufReader, Cursor};
69 ///
70 /// # #[tokio::main]
71 /// # async fn main() -> std::io::Result<()> {
72 /// // Create data with invalid UTF-8 bytes
73 /// let mut data = Vec::new();
74 /// data.extend_from_slice("Hello ".as_bytes());
75 /// data.push(0xFF); // Invalid UTF-8 byte
76 /// data.push(0xFE); // Invalid UTF-8 byte
77 /// data.extend_from_slice(" World".as_bytes());
78 ///
79 /// let mut reader = BufReader::new(Cursor::new(data));
80 /// let mut output = Vec::new();
81 ///
82 /// let bytes_read = reader.read_utf8_boundaries_lossy(&mut output).await?;
83 ///
84 /// let result = String::from_utf8(output).unwrap();
85 /// assert!(result.contains("Hello "));
86 /// assert!(result.contains(" World"));
87 /// assert!(result.contains('\u{FFFD}')); // Replacement character
88 ///
89 /// // Count replacement characters (should be 2 for the 2 invalid bytes)
90 /// let replacement_count = result.chars().filter(|&c| c == '\u{FFFD}').count();
91 /// assert_eq!(replacement_count, 2);
92 /// # Ok(())
93 /// # }
94 /// ```
95 ///
96 /// ## Reading from a stream until EOF
97 ///
98 /// ```
99 /// use async_read_super_ext::AsyncReadSuperExt;
100 /// use tokio::io::{BufReader, Cursor};
101 ///
102 /// # #[tokio::main]
103 /// # async fn main() -> std::io::Result<()> {
104 /// let data = "Line 1\nLine 2\nLine 3";
105 /// let mut reader = BufReader::new(Cursor::new(data.as_bytes()));
106 /// let mut all_data = Vec::new();
107 /// let mut buffer = Vec::new();
108 ///
109 /// loop {
110 /// buffer.clear();
111 /// let bytes_read = reader.read_utf8_boundaries_lossy(&mut buffer).await?;
112 ///
113 /// if bytes_read == 0 {
114 /// break; // EOF reached
115 /// }
116 ///
117 /// all_data.extend_from_slice(&buffer);
118 /// }
119 ///
120 /// let result = String::from_utf8(all_data).unwrap();
121 /// assert_eq!(result, data);
122 /// # Ok(())
123 /// # }
124 /// ```
125 ///
126 /// # Errors
127 ///
128 /// This method will return an error if the underlying reader encounters an I/O error.
129 /// Invalid UTF-8 sequences do not cause errors; they are handled by replacement.
130 fn read_utf8_boundaries_lossy<'a>(&'a mut self, buf: &'a mut Vec<u8>) -> Utf8BoundariesLossy<'a, Self>
131 where
132 Self: Unpin,
133 {
134 read_utf8_boundaries_lossy(self, buf)
135 }
136}
137
138impl<R: AsyncBufRead + ?Sized> AsyncReadSuperExt for R {}