utf8_zero/lib.rs
1#![no_std]
2#![deny(missing_docs)]
3
4//! Zero-copy, incremental UTF-8 decoding with error handling.
5//!
6//! Three levels of API:
7//!
8//! * [`decode()`] -- low-level, single-shot decode of a byte slice. Returns the valid
9//! prefix and either an invalid sequence or an incomplete suffix that can be completed
10//! with more input.
11//! * [`LossyDecoder`] -- a push-based streaming decoder. Feed it chunks of bytes and it
12//! calls back with `&str` slices, replacing errors with U+FFFD.
13//! * [`BufReadDecoder`] (requires the `std` feature) -- a pull-based streaming decoder
14//! wrapping any [`std::io::BufRead`], with both strict and lossy modes.
15
16#[cfg(feature = "std")]
17extern crate std;
18
19mod lossy;
20#[cfg(feature = "std")]
21mod read;
22
23pub use lossy::LossyDecoder;
24#[cfg(feature = "std")]
25pub use read::{BufReadDecoder, BufReadDecoderError};
26
27use core::cmp;
28use core::fmt;
29use core::str;
30
/// The Unicode replacement character, U+FFFD, as a string slice.
///
/// In lossy decoding, insert it once for every decoding error.
pub const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
33
34/// Error from [`decode()`] when the input is not entirely valid UTF-8.
35#[derive(Debug, Copy, Clone)]
36pub enum DecodeError<'a> {
37 /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`,
38 /// then call `decode()` again with `remaining_input`.
39 Invalid {
40 /// The leading valid UTF-8 portion of the input.
41 valid_prefix: &'a str,
42 /// The bytes that form the invalid sequence.
43 invalid_sequence: &'a [u8],
44 /// The bytes after the invalid sequence, not yet decoded.
45 remaining_input: &'a [u8],
46 },
47
48 /// Call the `incomplete_suffix.try_complete` method with more input when available.
49 /// If no more input is available, this is an invalid byte sequence.
50 Incomplete {
51 /// The leading valid UTF-8 portion of the input.
52 valid_prefix: &'a str,
53 /// The trailing bytes that start a multi-byte code point but are not complete.
54 incomplete_suffix: Incomplete,
55 },
56}
57
58impl<'a> fmt::Display for DecodeError<'a> {
59 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
60 match *self {
61 DecodeError::Invalid {
62 valid_prefix,
63 invalid_sequence,
64 remaining_input,
65 } => write!(
66 f,
67 "found invalid byte sequence {invalid_sequence:02x?} after \
68 {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \
69 unprocessed bytes",
70 invalid_sequence = invalid_sequence,
71 valid_byte_count = valid_prefix.len(),
72 unprocessed_byte_count = remaining_input.len()
73 ),
74 DecodeError::Incomplete {
75 valid_prefix,
76 incomplete_suffix,
77 } => write!(
78 f,
79 "found incomplete byte sequence {incomplete_suffix:02x?} after \
80 {valid_byte_count} bytes",
81 incomplete_suffix = incomplete_suffix,
82 valid_byte_count = valid_prefix.len()
83 ),
84 }
85 }
86}
87
// Only available with the `std` feature, because the `Error` trait is named through `std` here.
// The trait's default methods are used as-is.
#[cfg(feature = "std")]
impl<'a> std::error::Error for DecodeError<'a> {}
90
/// An incomplete byte sequence for a multi-byte UTF-8 code point.
///
/// Feed more bytes via [`try_complete()`](Incomplete::try_complete) to finish decoding.
#[derive(Debug, Copy, Clone)]
pub struct Incomplete {
    /// Internal buffer holding the incomplete bytes (up to 4).
    ///
    /// Only the first `buffer_len` bytes are meaningful; the rest is scratch space.
    pub buffer: [u8; 4],
    /// How many bytes in `buffer` are occupied.
    pub buffer_len: u8,
}
101
102/// Decode a byte slice as UTF-8, returning the valid prefix on error.
103///
104/// Unlike [`std::str::from_utf8()`], this distinguishes between invalid and
105/// incomplete byte sequences so that callers can request more input.
106///
107/// ```
108/// use utf8_zero::{decode, DecodeError};
109///
110/// // Fully valid input.
111/// assert_eq!(decode(b"hello").unwrap(), "hello");
112///
113/// // Invalid byte — returns the valid prefix and the bad sequence.
114/// match decode(b"hello\xC0world") {
115/// Err(DecodeError::Invalid { valid_prefix, .. }) => {
116/// assert_eq!(valid_prefix, "hello");
117/// }
118/// _ => unreachable!(),
119/// }
120///
121/// // Input ends mid-codepoint — returns Incomplete so the caller can
122/// // supply more bytes.
123/// match decode(b"\xC3") {
124/// Err(DecodeError::Incomplete { valid_prefix, incomplete_suffix }) => {
125/// assert_eq!(valid_prefix, "");
126/// assert_eq!(incomplete_suffix.buffer_len, 1);
127/// }
128/// _ => unreachable!(),
129/// }
130/// ```
131pub fn decode(input: &[u8]) -> Result<&str, DecodeError<'_>> {
132 let error = match str::from_utf8(input) {
133 Ok(valid) => return Ok(valid),
134 Err(error) => error,
135 };
136
137 // FIXME: separate function from here to guide inlining?
138 let (valid, after_valid) = input.split_at(error.valid_up_to());
139 let valid = unsafe { str::from_utf8_unchecked(valid) };
140
141 match error.error_len() {
142 Some(invalid_sequence_length) => {
143 let (invalid, rest) = after_valid.split_at(invalid_sequence_length);
144 Err(DecodeError::Invalid {
145 valid_prefix: valid,
146 invalid_sequence: invalid,
147 remaining_input: rest,
148 })
149 }
150 None => Err(DecodeError::Incomplete {
151 valid_prefix: valid,
152 incomplete_suffix: Incomplete::new(after_valid),
153 }),
154 }
155}
156
157impl Incomplete {
158 /// Create an empty `Incomplete` with no buffered bytes.
159 pub fn empty() -> Self {
160 Incomplete {
161 buffer: [0, 0, 0, 0],
162 buffer_len: 0,
163 }
164 }
165
166 /// Returns `true` if no bytes are buffered.
167 pub fn is_empty(&self) -> bool {
168 self.buffer_len == 0
169 }
170
171 /// Create an `Incomplete` pre-filled with the given bytes.
172 pub fn new(bytes: &[u8]) -> Self {
173 let mut buffer = [0, 0, 0, 0];
174 let len = bytes.len();
175 buffer[..len].copy_from_slice(bytes);
176 Incomplete {
177 buffer,
178 buffer_len: len as u8,
179 }
180 }
181
182 /// * `None`: still incomplete, call `try_complete` again with more input.
183 /// If no more input is available, this is invalid byte sequence.
184 /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`.
185 /// To keep decoding, pass `remaining_input` to `decode()`.
186 #[allow(clippy::type_complexity)]
187 pub fn try_complete<'input>(
188 &mut self,
189 input: &'input [u8],
190 ) -> Option<(Result<&str, &[u8]>, &'input [u8])> {
191 let (consumed, opt_result) = self.try_complete_offsets(input);
192 let result = opt_result?;
193 let remaining_input = &input[consumed..];
194 let result_bytes = self.take_buffer();
195 let result = match result {
196 Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }),
197 Err(()) => Err(result_bytes),
198 };
199 Some((result, remaining_input))
200 }
201
202 fn take_buffer(&mut self) -> &[u8] {
203 let len = self.buffer_len as usize;
204 self.buffer_len = 0;
205 &self.buffer[..len]
206 }
207
208 /// (consumed_from_input, None): not enough input
209 /// (consumed_from_input, Some(Err(()))): error bytes in buffer
210 /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer
211 fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
212 let initial_buffer_len = self.buffer_len as usize;
213 let copied_from_input;
214 {
215 let unwritten = &mut self.buffer[initial_buffer_len..];
216 copied_from_input = cmp::min(unwritten.len(), input.len());
217 unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
218 }
219 let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
220 match str::from_utf8(spliced) {
221 Ok(_) => {
222 self.buffer_len = spliced.len() as u8;
223 (copied_from_input, Some(Ok(())))
224 }
225 Err(error) => {
226 let valid_up_to = error.valid_up_to();
227 if valid_up_to > 0 {
228 let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap();
229 self.buffer_len = valid_up_to as u8;
230 (consumed, Some(Ok(())))
231 } else {
232 match error.error_len() {
233 Some(invalid_sequence_length) => {
234 let consumed = invalid_sequence_length
235 .checked_sub(initial_buffer_len)
236 .unwrap();
237 self.buffer_len = invalid_sequence_length as u8;
238 (consumed, Some(Err(())))
239 }
240 None => {
241 self.buffer_len = spliced.len() as u8;
242 (copied_from_input, None)
243 }
244 }
245 }
246 }
247 }
248 }
249}