Skip to main content

utf8_zero/
lib.rs

1#![no_std]
2#![deny(missing_docs)]
3
4//! Zero-copy, incremental UTF-8 decoding with error handling.
5//!
6//! Three levels of API:
7//!
8//! * [`decode()`] -- low-level, single-shot decode of a byte slice. Returns the valid
9//!   prefix and either an invalid sequence or an incomplete suffix that can be completed
10//!   with more input.
11//! * [`LossyDecoder`] -- a push-based streaming decoder. Feed it chunks of bytes and it
12//!   calls back with `&str` slices, replacing errors with U+FFFD.
13//! * [`BufReadDecoder`] (requires the `std` feature) -- a pull-based streaming decoder
14//!   wrapping any [`std::io::BufRead`], with both strict and lossy modes.
15
16#[cfg(feature = "std")]
17extern crate std;
18
19mod lossy;
20#[cfg(feature = "std")]
21mod read;
22
23pub use lossy::LossyDecoder;
24#[cfg(feature = "std")]
25pub use read::{BufReadDecoder, BufReadDecoderError};
26
27use core::cmp;
28use core::fmt;
29use core::str;
30
/// U+FFFD REPLACEMENT CHARACTER as a string slice.
///
/// Lossy decoding emits this once for each decoding error it encounters.
pub const REPLACEMENT_CHARACTER: &'static str = "\u{FFFD}";
33
34/// Error from [`decode()`] when the input is not entirely valid UTF-8.
35#[derive(Debug, Copy, Clone)]
36pub enum DecodeError<'a> {
37    /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`,
38    /// then call `decode()` again with `remaining_input`.
39    Invalid {
40        /// The leading valid UTF-8 portion of the input.
41        valid_prefix: &'a str,
42        /// The bytes that form the invalid sequence.
43        invalid_sequence: &'a [u8],
44        /// The bytes after the invalid sequence, not yet decoded.
45        remaining_input: &'a [u8],
46    },
47
48    /// Call the `incomplete_suffix.try_complete` method with more input when available.
49    /// If no more input is available, this is an invalid byte sequence.
50    Incomplete {
51        /// The leading valid UTF-8 portion of the input.
52        valid_prefix: &'a str,
53        /// The trailing bytes that start a multi-byte code point but are not complete.
54        incomplete_suffix: Incomplete,
55    },
56}
57
58impl<'a> fmt::Display for DecodeError<'a> {
59    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
60        match *self {
61            DecodeError::Invalid {
62                valid_prefix,
63                invalid_sequence,
64                remaining_input,
65            } => write!(
66                f,
67                "found invalid byte sequence {invalid_sequence:02x?} after \
68                 {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \
69                 unprocessed bytes",
70                invalid_sequence = invalid_sequence,
71                valid_byte_count = valid_prefix.len(),
72                unprocessed_byte_count = remaining_input.len()
73            ),
74            DecodeError::Incomplete {
75                valid_prefix,
76                incomplete_suffix,
77            } => write!(
78                f,
79                "found incomplete byte sequence {incomplete_suffix:02x?} after \
80                 {valid_byte_count} bytes",
81                incomplete_suffix = incomplete_suffix,
82                valid_byte_count = valid_prefix.len()
83            ),
84        }
85    }
86}
87
// Only available with the `std` feature: the crate is `no_std` otherwise, so
// `std::error::Error` cannot be named.
#[cfg(feature = "std")]
impl std::error::Error for DecodeError<'_> {}
90
/// The leading bytes of a multi-byte UTF-8 code point whose remaining bytes
/// have not been seen yet.
///
/// Supply further input through [`try_complete()`](Incomplete::try_complete)
/// to finish decoding the code point.
#[derive(Clone, Copy, Debug)]
pub struct Incomplete {
    /// Storage for the bytes collected so far; a code point needs at most 4.
    pub buffer: [u8; 4],
    /// Number of leading bytes of `buffer` that are in use.
    pub buffer_len: u8,
}
101
102/// Decode a byte slice as UTF-8, returning the valid prefix on error.
103///
104/// Unlike [`std::str::from_utf8()`], this distinguishes between invalid and
105/// incomplete byte sequences so that callers can request more input.
106///
107/// ```
108/// use utf8_zero::{decode, DecodeError};
109///
110/// // Fully valid input.
111/// assert_eq!(decode(b"hello").unwrap(), "hello");
112///
113/// // Invalid byte — returns the valid prefix and the bad sequence.
114/// match decode(b"hello\xC0world") {
115///     Err(DecodeError::Invalid { valid_prefix, .. }) => {
116///         assert_eq!(valid_prefix, "hello");
117///     }
118///     _ => unreachable!(),
119/// }
120///
121/// // Input ends mid-codepoint — returns Incomplete so the caller can
122/// // supply more bytes.
123/// match decode(b"\xC3") {
124///     Err(DecodeError::Incomplete { valid_prefix, incomplete_suffix }) => {
125///         assert_eq!(valid_prefix, "");
126///         assert_eq!(incomplete_suffix.buffer_len, 1);
127///     }
128///     _ => unreachable!(),
129/// }
130/// ```
131pub fn decode(input: &[u8]) -> Result<&str, DecodeError<'_>> {
132    let error = match str::from_utf8(input) {
133        Ok(valid) => return Ok(valid),
134        Err(error) => error,
135    };
136
137    // FIXME: separate function from here to guide inlining?
138    let (valid, after_valid) = input.split_at(error.valid_up_to());
139    let valid = unsafe { str::from_utf8_unchecked(valid) };
140
141    match error.error_len() {
142        Some(invalid_sequence_length) => {
143            let (invalid, rest) = after_valid.split_at(invalid_sequence_length);
144            Err(DecodeError::Invalid {
145                valid_prefix: valid,
146                invalid_sequence: invalid,
147                remaining_input: rest,
148            })
149        }
150        None => Err(DecodeError::Incomplete {
151            valid_prefix: valid,
152            incomplete_suffix: Incomplete::new(after_valid),
153        }),
154    }
155}
156
157impl Incomplete {
158    /// Create an empty `Incomplete` with no buffered bytes.
159    pub fn empty() -> Self {
160        Incomplete {
161            buffer: [0, 0, 0, 0],
162            buffer_len: 0,
163        }
164    }
165
166    /// Returns `true` if no bytes are buffered.
167    pub fn is_empty(&self) -> bool {
168        self.buffer_len == 0
169    }
170
171    /// Create an `Incomplete` pre-filled with the given bytes.
172    pub fn new(bytes: &[u8]) -> Self {
173        let mut buffer = [0, 0, 0, 0];
174        let len = bytes.len();
175        buffer[..len].copy_from_slice(bytes);
176        Incomplete {
177            buffer,
178            buffer_len: len as u8,
179        }
180    }
181
182    /// * `None`: still incomplete, call `try_complete` again with more input.
183    ///   If no more input is available, this is invalid byte sequence.
184    /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`.
185    ///   To keep decoding, pass `remaining_input` to `decode()`.
186    #[allow(clippy::type_complexity)]
187    pub fn try_complete<'input>(
188        &mut self,
189        input: &'input [u8],
190    ) -> Option<(Result<&str, &[u8]>, &'input [u8])> {
191        let (consumed, opt_result) = self.try_complete_offsets(input);
192        let result = opt_result?;
193        let remaining_input = &input[consumed..];
194        let result_bytes = self.take_buffer();
195        let result = match result {
196            Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }),
197            Err(()) => Err(result_bytes),
198        };
199        Some((result, remaining_input))
200    }
201
202    fn take_buffer(&mut self) -> &[u8] {
203        let len = self.buffer_len as usize;
204        self.buffer_len = 0;
205        &self.buffer[..len]
206    }
207
208    /// (consumed_from_input, None): not enough input
209    /// (consumed_from_input, Some(Err(()))): error bytes in buffer
210    /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer
211    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
212        let initial_buffer_len = self.buffer_len as usize;
213        let copied_from_input;
214        {
215            let unwritten = &mut self.buffer[initial_buffer_len..];
216            copied_from_input = cmp::min(unwritten.len(), input.len());
217            unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
218        }
219        let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
220        match str::from_utf8(spliced) {
221            Ok(_) => {
222                self.buffer_len = spliced.len() as u8;
223                (copied_from_input, Some(Ok(())))
224            }
225            Err(error) => {
226                let valid_up_to = error.valid_up_to();
227                if valid_up_to > 0 {
228                    let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap();
229                    self.buffer_len = valid_up_to as u8;
230                    (consumed, Some(Ok(())))
231                } else {
232                    match error.error_len() {
233                        Some(invalid_sequence_length) => {
234                            let consumed = invalid_sequence_length
235                                .checked_sub(initial_buffer_len)
236                                .unwrap();
237                            self.buffer_len = invalid_sequence_length as u8;
238                            (consumed, Some(Err(())))
239                        }
240                        None => {
241                            self.buffer_len = spliced.len() as u8;
242                            (copied_from_input, None)
243                        }
244                    }
245                }
246            }
247        }
248    }
249}