Skip to main content

utf8_decode/
lib.rs

//! This crate provides incremental UTF-8 decoders implementing the
//! [`Iterator`] trait, wrapping around [`u8`] byte iterators.
3//!
//! It also provides the `const`-compatible [`try_decode_char`] to decode UTF-8
//! byte streams, even in `const` contexts.
6//!
7//! [`u8`]: std::primitive::u8
8//! [`Iterator`]: std::iter::Iterator
9//! [`try_decode_char`]: crate::try_decode_char
10//!
11//! ## `Decoder`
12//!
13//! The [`Decoder`] iterator can be used, for instance, to decode `u8` slices.
14//!
15//! ```rust
16//! use utf8_decode::Decoder;
17//! # fn main() -> std::io::Result<()> {
18//! let bytes = [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33];
19//!
20//! let decoder = Decoder::new(bytes.iter().cloned());
21//!
22//! let mut string = String::new();
23//! for c in decoder {
24//!     string.push(c?);
25//! }
26//!
27//! println!("{}", string);
28//! # Ok(())
29//! # }
30//! ```
31//!
32//! ## `TryDecoder`
33//!
34//! The [`TryDecoder`] iterator can be used, for instance, to decode UTF-8
35//! encoded files.
36//!
37//! ```rust
38//! # use std::{fs::File, io::Read};
39//! use utf8_decode::TryDecoder;
40//! # fn main() -> std::io::Result<()> {
41//! let file = File::open("examples/file.txt")?;
42//!
43//! let decoder = TryDecoder::new(file.bytes());
44//!
45//! let mut string = String::new();
46//! for c in decoder {
47//!     string.push(c?);
48//! }
49//! # Ok(())
50//! # }
51//! ```
52//!
53//! [`TryDecoder`]: crate::fallible::TryDecoder
54#![cfg_attr(not(feature = "std"), no_std)]
55use core::fmt::{self, Debug, Display, Formatter};
56
57mod fallible;
58mod infallible;
59
60pub use fallible::{TryDecoder, try_decode_iter_char};
61pub use infallible::Decoder;
62
/// Error describing an invalid UTF-8 sequence met while decoding.
///
/// Ordering compares `offset` first, then `len` (field declaration order).
#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct Utf8Error {
    /// Byte offset at which the offending sequence starts.
    pub offset: usize,
    /// Number of bytes attributed to the offending sequence.
    pub len: usize,
}
68
69impl Utf8Error {
70    pub const fn new(offset: usize, len: usize) -> Self {
71        Self { offset, len }
72    }
73}
74
75impl Display for Utf8Error {
76    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
77        write!(f, "invalid UTF-8 sequence")
78    }
79}
80
81impl Debug for Utf8Error {
82    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
83        write!(f, "invalid UTF-8 sequence")
84    }
85}
86
// Implemented through `core::error::Error` (not `std::error::Error`) so the
// impl does not require `std`; the crate is `no_std` when the `std` feature
// is disabled.
impl core::error::Error for Utf8Error {}
88
/// Converts a [`Utf8Error`] into an [`std::io::Error`] of kind
/// [`InvalidData`](std::io::ErrorKind::InvalidData), keeping the original
/// error as the wrapped payload.
///
/// Only available with the `std` feature.
#[cfg(feature = "std")]
impl From<Utf8Error> for std::io::Error {
    fn from(value: Utf8Error) -> Self {
        let kind = std::io::ErrorKind::InvalidData;
        std::io::Error::new(kind, value)
    }
}
95
96/// Read the UTF-8 encoded character out of the given slice at position `i`.
97///
98/// Returns the character and its encoded byte length, moving the `i` value to
99/// point to the start of the next character (or end of string).
100pub const fn try_decode_char(bytes: &[u8], i: &mut usize) -> Result<Option<(char, u8)>, Utf8Error> {
101    let offset = *i;
102    match try_decode_codepoint(bytes, offset, i) {
103        Ok(Some((codepoint, len))) => match char::from_u32(codepoint) {
104            Some(c) => Ok(Some((c, len))),
105            None => Err(Utf8Error::new(offset, len as usize)),
106        },
107        Ok(None) => Ok(None),
108        Err(e) => Err(e),
109    }
110}
111
112/// Read the next Unicode codepoint.
113///
114/// - `offset` is the byte offset of the codepoint in the byte string. This will
115///   be returned in any enventual `Utf8Error`.
116///
117/// Returns the codepoint as a `u32` and its encoded byte length.
118const fn try_decode_codepoint(
119    bytes: &[u8],
120    offset: usize,
121    i: &mut usize,
122) -> Result<Option<(u32, u8)>, Utf8Error> {
123    if *i < bytes.len() {
124        let a = bytes[*i] as u32;
125
126        *i += 1;
127
128        if a & 0x80 == 0x00 {
129            // 1 byte.
130            Ok(Some((a, 1)))
131        } else if a & 0xE0 == 0xC0 {
132            // 2 bytes.
133            match try_next_slice_byte(bytes, offset, i) {
134                Ok(b) => Ok(Some(((a & 0x1F) << 6 | b, 2))),
135                Err(e) => Err(e),
136            }
137        } else if a & 0xF0 == 0xE0 {
138            // 3 bytes.
139            match try_next_slice_byte(bytes, offset, i) {
140                Ok(b) => match try_next_slice_byte(bytes, offset, i) {
141                    Ok(c) => Ok(Some(((a & 0x0F) << 12 | b << 6 | c, 3))),
142                    Err(e) => Err(e),
143                },
144                Err(e) => Err(e),
145            }
146        } else if a & 0xF8 == 0xF0 {
147            // 4 bytes.
148            match try_next_slice_byte(bytes, offset, i) {
149                Ok(b) => match try_next_slice_byte(bytes, offset, i) {
150                    Ok(c) => match try_next_slice_byte(bytes, offset, i) {
151                        Ok(d) => Ok(Some(((a & 0x07) << 18 | b << 12 | c << 6 | d, 4))),
152                        Err(e) => Err(e),
153                    },
154                    Err(e) => Err(e),
155                },
156                Err(e) => Err(e),
157            }
158        } else {
159            Err(Utf8Error::new(offset, 1))
160        }
161    } else {
162        Ok(None)
163    }
164}
165
166/// Read the next byte of the UTF-8 character out of the given slice.
167///
168/// - `offset` is the byte offset of the current codepoint.
169///
170/// The byte is returned as a `u32` for later shifting.
171const fn try_next_slice_byte(bytes: &[u8], offset: usize, i: &mut usize) -> Result<u32, Utf8Error> {
172    if *i < bytes.len() {
173        let c = bytes[*i];
174
175        *i += 1;
176
177        if c & 0xC0 == 0x80 {
178            Ok((c & 0x3F) as u32)
179        } else {
180            Err(Utf8Error::new(offset, *i - offset))
181        }
182    } else {
183        Err(Utf8Error::new(offset, *i - offset))
184    }
185}