// utf8_decode/lib.rs
//! This crate provides incremental UTF-8 decoders implementing the
//! [`Iterator`] trait, wrapping around [`u8`] byte iterators.
//!
//! It also provides the `const`-compatible [`try_decode_char`] to decode UTF-8
//! byte streams, even in `const` contexts.
//!
//! [`u8`]: std::primitive::u8
//! [`Iterator`]: std::iter::Iterator
//! [`try_decode_char`]: crate::try_decode_char
//!
//! ## `Decoder`
//!
//! The [`Decoder`] iterator can be used, for instance, to decode `u8` slices.
//!
//! ```rust
//! use utf8_decode::Decoder;
//! # fn main() -> std::io::Result<()> {
//! let bytes = [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33];
//!
//! let decoder = Decoder::new(bytes.iter().cloned());
//!
//! let mut string = String::new();
//! for c in decoder {
//!     string.push(c?);
//! }
//!
//! println!("{}", string);
//! # Ok(())
//! # }
//! ```
//!
//! ## `TryDecoder`
//!
//! The [`TryDecoder`] iterator can be used, for instance, to decode UTF-8
//! encoded files.
//!
//! ```rust
//! # use std::{fs::File, io::Read};
//! use utf8_decode::TryDecoder;
//! # fn main() -> std::io::Result<()> {
//! let file = File::open("examples/file.txt")?;
//!
//! let decoder = TryDecoder::new(file.bytes());
//!
//! let mut string = String::new();
//! for c in decoder {
//!     string.push(c?);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! [`TryDecoder`]: crate::fallible::TryDecoder
54#![cfg_attr(not(feature = "std"), no_std)]
55use core::fmt::{self, Debug, Display, Formatter};
56
57mod fallible;
58mod infallible;
59
60pub use fallible::{TryDecoder, try_decode_iter_char};
61pub use infallible::Decoder;
62
/// Error describing an invalid UTF-8 sequence met while decoding.
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct Utf8Error {
    /// Byte offset at which the invalid sequence starts.
    pub offset: usize,

    /// Number of bytes, counted from `offset`, covered by the error.
    pub len: usize,
}
68
69impl Utf8Error {
70 pub const fn new(offset: usize, len: usize) -> Self {
71 Self { offset, len }
72 }
73}
74
75impl Display for Utf8Error {
76 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
77 write!(f, "invalid UTF-8 sequence")
78 }
79}
80
81impl Debug for Utf8Error {
82 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
83 write!(f, "invalid UTF-8 sequence")
84 }
85}
86
// Marker implementation: every method of the `Error` trait keeps its default,
// so the error is described only through its `Debug` and `Display` impls.
impl core::error::Error for Utf8Error {}
88
#[cfg(feature = "std")]
impl From<Utf8Error> for std::io::Error {
    /// Wraps the decoding error in an I/O error of kind `InvalidData`, which
    /// lets `?` be used on decoding results inside functions returning
    /// `std::io::Result`.
    fn from(error: Utf8Error) -> Self {
        std::io::Error::new(std::io::ErrorKind::InvalidData, error)
    }
}
95
96/// Read the UTF-8 encoded character out of the given slice at position `i`.
97///
98/// Returns the character and its encoded byte length, moving the `i` value to
99/// point to the start of the next character (or end of string).
100pub const fn try_decode_char(bytes: &[u8], i: &mut usize) -> Result<Option<(char, u8)>, Utf8Error> {
101 let offset = *i;
102 match try_decode_codepoint(bytes, offset, i) {
103 Ok(Some((codepoint, len))) => match char::from_u32(codepoint) {
104 Some(c) => Ok(Some((c, len))),
105 None => Err(Utf8Error::new(offset, len as usize)),
106 },
107 Ok(None) => Ok(None),
108 Err(e) => Err(e),
109 }
110}
111
112/// Read the next Unicode codepoint.
113///
114/// - `offset` is the byte offset of the codepoint in the byte string. This will
115/// be returned in any enventual `Utf8Error`.
116///
117/// Returns the codepoint as a `u32` and its encoded byte length.
118const fn try_decode_codepoint(
119 bytes: &[u8],
120 offset: usize,
121 i: &mut usize,
122) -> Result<Option<(u32, u8)>, Utf8Error> {
123 if *i < bytes.len() {
124 let a = bytes[*i] as u32;
125
126 *i += 1;
127
128 if a & 0x80 == 0x00 {
129 // 1 byte.
130 Ok(Some((a, 1)))
131 } else if a & 0xE0 == 0xC0 {
132 // 2 bytes.
133 match try_next_slice_byte(bytes, offset, i) {
134 Ok(b) => Ok(Some(((a & 0x1F) << 6 | b, 2))),
135 Err(e) => Err(e),
136 }
137 } else if a & 0xF0 == 0xE0 {
138 // 3 bytes.
139 match try_next_slice_byte(bytes, offset, i) {
140 Ok(b) => match try_next_slice_byte(bytes, offset, i) {
141 Ok(c) => Ok(Some(((a & 0x0F) << 12 | b << 6 | c, 3))),
142 Err(e) => Err(e),
143 },
144 Err(e) => Err(e),
145 }
146 } else if a & 0xF8 == 0xF0 {
147 // 4 bytes.
148 match try_next_slice_byte(bytes, offset, i) {
149 Ok(b) => match try_next_slice_byte(bytes, offset, i) {
150 Ok(c) => match try_next_slice_byte(bytes, offset, i) {
151 Ok(d) => Ok(Some(((a & 0x07) << 18 | b << 12 | c << 6 | d, 4))),
152 Err(e) => Err(e),
153 },
154 Err(e) => Err(e),
155 },
156 Err(e) => Err(e),
157 }
158 } else {
159 Err(Utf8Error::new(offset, 1))
160 }
161 } else {
162 Ok(None)
163 }
164}
165
166/// Read the next byte of the UTF-8 character out of the given slice.
167///
168/// - `offset` is the byte offset of the current codepoint.
169///
170/// The byte is returned as a `u32` for later shifting.
171const fn try_next_slice_byte(bytes: &[u8], offset: usize, i: &mut usize) -> Result<u32, Utf8Error> {
172 if *i < bytes.len() {
173 let c = bytes[*i];
174
175 *i += 1;
176
177 if c & 0xC0 == 0x80 {
178 Ok((c & 0x3F) as u32)
179 } else {
180 Err(Utf8Error::new(offset, *i - offset))
181 }
182 } else {
183 Err(Utf8Error::new(offset, *i - offset))
184 }
185}