rut/lib.rs
1#![no_std]
2//! Rut is a small UTF-8 decoding library for applications that need to decode individual characters.\
3//! It provides a bytewise decoder, and functions for decoding byte slices.
4//!
5//! It is completely `no_std` and should provide good performance.<sup>[*citation needed*]</sup>
6//!
7//! # Conformance
8//!
9//! Rut is fully conformant to the specifications and restrictions of the [Unicode standard][unicode].\
10//! Additionally, it follows [W3C's standard for UTF-8 decoding][w3c] with regards to error signalling.
11//!
12//! # Testing
13//!
14//! Some tests are in place, however it is not comprehensive yet.
15//! However, Rut has been pretty thoroughly fuzzed on random input and passes this [stress test for UTF-8 decoders][stress].
16//!
17//! # As Seen on TV!
18//!
19//! Rut began life, and is still used in, [Termiku], a terminal emulator written in Rust.
20//!
21//! [unicode]: https://www.unicode.org/versions/latest/
22//! [w3c]: https://www.w3.org/TR/encoding/#utf-8-decoder
23//! [stress]: https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
24//! [Termiku]: https://github.com/ShinySaana/Termiku
25
26/// Bitmasks for UTF-8 bytes.
27/// 10xx xxxx for continuation bytes,
28/// 110x xxxx for 2 byte,
29/// 1110 xxxx for 3 byte, and
30/// 1111 0xxx for 4 byte leading bytes.
31const MASK: [u8; 4] = [0x3F, 0x1F, 0x0F, 0x07];
32
33/// UTF-8 length table.
34/// This is blatantly taken from Rust's corelib.
35/// However, the API for it is unstable, so we supply our own.
36/// https://doc.rust-lang.org/stable/core/str/fn.utf8_char_width.html
37const UTF8_LENGTH_TABLE: [u8; 256] = [
38 // 0x00-0x7F: 1 byte.
39 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
40 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
41 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
42 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
43 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
44 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
45 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
46 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
47 // 0x80-0xBF: continuation bytes.
48 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
49 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
50 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
51 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
52 // 0xC2-0xDF: 2 bytes.
53 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
54 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
55 // 0xE0-0xEF: 3 bytes.
56 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
57 // 0xF0-0xF4: 4 bytes.
58 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,
59];
60
61/// Result type for the [`Decoder::decode_byte`](struct.Decoder.html#method.decode_byte) method.
62///
63/// Note that this is *not* an alias for Rust's default `Result` type.
64#[derive(Copy, Clone, Eq, PartialEq, Debug)]
65pub enum DecoderResult {
66 /// A byte was decoded successfully.
67 Continue,
68 /// A character was decoded successfully.
69 Char(char),
70 /// An error occured.
71 Error(self::Error),
72}
73
74/// The error type returned by all decoding methods.
75#[derive(Copy, Clone, Eq, PartialEq, Debug)]
76pub enum Error {
77 /// An invalid byte value was encountered.
78 InvalidByte,
79
80 /// A continuation byte was encountered outside of a sequence.
81 UnexpectedContinuation,
82
83 /// A sequence was terminated by a byte that isn't a valid continuation.
84 BrokenSequence,
85
86 /// An [overlong encoding](https://en.wikipedia.org/wiki/UTF-8#Overlong_encodings) was encountered.
87 OverlongEncoding,
88
89 /// An encoding for an invalid unicode scalar value was encountered.
90 InvalidCodePoint,
91
92 /// The end of the sequence was reached before it could be fully decoded.
93 TruncatedSequence,
94}
95
96/// A bytewise UTF-8 decoder.
97#[derive(Copy, Clone, Debug, Default)]
98pub struct Decoder {
99 /// Unicode Scalar Value decoded up to this point.
100 value: u32,
101 /// Number of continuation bytes in this sequence.
102 needed: u8,
103 /// Leading byte of this sequence.
104 lead: u8,
105}
106
107impl Decoder {
108 /// Creates a new decoder.
109 #[inline]
110 pub fn new() -> Decoder {
111 Decoder::default()
112 }
113
114 /// Decodes a single byte.
115 ///
116 /// # Correct Use
117 ///
118 /// Due to the nature of bytewise UTF-8 decoding, blindly continuing through errors will lead to vastly incorrect results.
119 ///
120 /// For example, the byte sequence `C2, 41, 42` would return the results `<Continue>, <Error>, B`.\
121 /// However, Unicode expects this sequence to be treated as `<Error>, A, B`.
122 ///
123 /// This means that, when an error is encountered inside of a sequence,
124 /// the offending byte should be passsed to `decode_byte` again.\
125 /// This is because an ill-formed sequence ends at the last valid byte, not at the first invalid byte.
126 ///
127 /// This is essentially what the [`decode_one`](fn.decode_one.html) function does.
128 ///
129 /// # Example
130 ///
131 /// Performing lossy decoding with `Decoder`:
132 ///
133 /// ```
134 /// use std::fmt::Write;
135 /// use rut::{Decoder, DecoderResult, Error::*};
136 ///
137 /// let mut s = String::new();
138 /// let mut d = Decoder::new();
139 ///
140 /// // An ill-formed UTF-8 sequence as described above.
141 /// let bytes = [0xC2, 0x41, 0x42];
142 ///
143 /// for &byte in &bytes {
144 /// let mut result = d.decode_byte(byte);
145 ///
146 /// if let DecoderResult::Error(e) = result {
147 /// match e {
148 /// BrokenSequence
149 /// | OverlongEncoding
150 /// | InvalidCodePoint => {
151 /// write!(&mut s, "\u{FFFD}").unwrap();
152 /// result = d.decode_byte(byte);
153 /// },
154 /// _ => {}
155 /// }
156 /// }
157 ///
158 /// match result {
159 /// DecoderResult::Continue => continue,
160 /// DecoderResult::Char(c) => write!(&mut s, "{}", c).unwrap(),
161 /// DecoderResult::Error(_) => write!(&mut s, "\u{FFFD}").unwrap()
162 /// }
163 /// }
164 ///
165 /// assert_eq!(&s, "�AB");
166 /// ```
167 pub fn decode_byte(&mut self, byte: u8) -> DecoderResult {
168 use self::Error::*;
169
170 // These values can never appear.
171 // More specifically, these would be leading bytes.
172 // C0 and C1 can only produce overlong encodings of U+0000 through U+007F
173 // F5 through FF can only produce scalar values greater than U+10FFFF, or encodings longer than 4 bytes.
174 match byte {
175 0xC0 | 0xC1 | 0xF5..=0xFF => {
176 self.needed = 0;
177 return DecoderResult::Error(InvalidByte);
178 }
179 _ => {}
180 }
181
182 if self.needed == 0 {
183 match byte {
184 0x00..=0x7F => return DecoderResult::Char(char::from(byte)),
185 0x80..=0xBF => return DecoderResult::Error(UnexpectedContinuation),
186 _ => {}
187 }
188
189 let needed = UTF8_LENGTH_TABLE[byte as usize] - 1;
190
191 self.value = (byte & MASK[needed as usize]) as u32;
192 self.needed = needed as u8;
193 self.lead = byte;
194
195 return DecoderResult::Continue;
196 }
197
198 if !(0x80..=0xBF).contains(&byte) {
199 self.needed = 0;
200 return DecoderResult::Error(BrokenSequence);
201 }
202
203 // Check for the validity of this sequence.
204 if self.lead != 0 {
205 // E0 80 through E0 9F and F0 80 through F0 8F
206 // produce overlong encodings.
207 if self.lead == 0xE0 && byte < 0xA0 ||
208 self.lead == 0xF0 && byte < 0x90 {
209 self.needed = 0;
210 return DecoderResult::Error(OverlongEncoding);
211 }
212
213 // ED 80 through ED 9E and F4 90 through F4 BF
214 // produce invalid unicode scalar values.
215 if self.lead == 0xED && byte > 0x9F ||
216 self.lead == 0xF4 && byte > 0x8F {
217 self.needed = 0;
218 return DecoderResult::Error(InvalidCodePoint);
219 }
220
221 // We only want to do this check for the first continuation byte, so we reset `lead` here.
222 self.lead = 0;
223 }
224
225 self.value = (self.value << 6) | (byte & MASK[0]) as u32;
226 self.needed -= 1;
227
228 // We're done!
229 if self.needed == 0 {
230 return DecoderResult::Char(unsafe {
231 core::char::from_u32_unchecked(self.value)
232 });
233 }
234
235 DecoderResult::Continue
236 }
237}
238
239/// Decodes one character from a byte slice, returning a `Result` and the remainder of the slice.
240///
241/// # Panics
242///
243/// Panics if `bytes` is empty.
244///
245/// # Examples
246///
247/// ```rust
248/// // Valid UTF-8 encoding of '€'
249/// let bytes = [0xE2, 0x82, 0xAC];
250///
251/// let (result, rest) = rut::decode_one(&bytes);
252///
253/// assert_eq!(result, Ok('€'));
254/// assert_eq!(rest, &[]);
255/// ```
256///
257/// ```rust
258/// use rut::Error::*;
259///
260/// // Ill-formed sequence followed by 2 valid characters
261/// let bytes = [0xC2, 0x41, 0x42];
262///
263/// let (result1, rest1) = rut::decode_one(&bytes);
264/// let (result2, rest2) = rut::decode_one(rest1);
265/// let (result3, rest3) = rut::decode_one(rest2);
266///
267/// assert_eq!(result1, Err(BrokenSequence));
268/// assert_eq!(result2, Ok('A'));
269/// assert_eq!(result3, Ok('B'));
270/// assert_eq!(rest1, &[0x41, 0x42]);
271/// assert_eq!(rest2, &[0x42]);
272/// assert_eq!(rest3, &[]);
273/// ```
274pub fn decode_one(bytes: &[u8]) -> (Result<char, Error>, &[u8]) {
275 use self::Error::*;
276
277 // An empty slice would return `TruncatedSequence`, which doesn't make much sense.
278 // Just panic instead.
279 assert!(!bytes.is_empty());
280
281 let mut p = Decoder::new();
282
283 for (idx, &byte) in bytes.iter().enumerate() {
284 match p.decode_byte(byte) {
285 DecoderResult::Continue => continue,
286 DecoderResult::Char(c) => return (Ok(c), &bytes[idx + 1..]),
287 // To match W3C's / Unicode's behavior,
288 // errors which occur inside a sequence require
289 // the current byte to get decoded again.
290 // This is equivalent to step 4.2 in https://www.w3.org/TR/encoding/#utf-8-decoder.
291 DecoderResult::Error(e) => match e {
292 BrokenSequence
293 | OverlongEncoding
294 | InvalidCodePoint => {
295 return (Err(e), &bytes[idx..])
296 }
297 _ => return (Err(e), &bytes[idx + 1..]),
298 },
299 }
300 }
301
302 // We didn't get anything out of the slice (i.e. every byte returned `Continue`).
303 (Err(TruncatedSequence), &[])
304}
305
306/// An iterator that decodes characters from a byte slice.
307///
308/// This `struct` is created by the [`decode`](fn.decode.html) function.
309/// See its documentation for more.
310#[must_use = "iterators are lazy and do nothing unless consumed"]
311#[derive(Clone)]
312pub struct Decode<'a> {
313 bytes: &'a [u8],
314}
315
316/// Creates an iterator for decoding characters from a byte slice.
317///
318/// This is done by calling [`decode_one`](fn.decode_one.html) repeatedly until the slice has been exhausted.
319///
320/// # Examples
321///
322/// ```rust
323/// // Valid UTF-8 encoding of '€'
324/// let bytes = [0xE2, 0x82, 0xAC];
325///
326/// let mut it = rut::decode(&bytes);
327///
328/// assert_eq!(it.next(), Some(Ok('€')));
329/// assert_eq!(it.next(), None);
330/// ```
331///
332/// ```rust
333/// use rut::Error::*;
334///
335/// // Ill-formed sequence followed by 2 valid characters
336/// let bytes = [0xC2, 0x41, 0x42];
337///
338/// let mut it = rut::decode(&bytes);
339///
340/// assert_eq!(it.next(), Some(Err(BrokenSequence)));
341/// assert_eq!(it.next(), Some(Ok('A')));
342/// assert_eq!(it.next(), Some(Ok('B')));
343/// assert_eq!(it.next(), None);
344/// ```
345#[inline]
346pub fn decode(bytes: &[u8]) -> Decode<'_> {
347 Decode { bytes }
348}
349
350impl Decode<'_> {
351 /// Returns the unprocessed part of the stored byte slice.
352 ///
353 /// # Example
354 ///
355 /// ```rust
356 /// // "ABC"
357 /// let bytes = &[0x41, 0x42, 0x43];
358 ///
359 /// let mut d = rut::decode(bytes);
360 ///
361 /// assert_eq!(d.next(), Some(Ok('A')));
362 /// assert_eq!(d.rest(), &[0x42, 0x43]);
363 /// ```
364 pub fn rest(&self) -> &[u8] {
365 self.bytes
366 }
367}
368
369impl ::core::iter::Iterator for Decode<'_> {
370 type Item = Result<char, self::Error>;
371
372 fn next(&mut self) -> Option<Self::Item> {
373 if self.bytes.is_empty() {
374 None
375 } else {
376 let (result, rest) = decode_one(self.bytes);
377 self.bytes = rest;
378 Some(result)
379 }
380 }
381}
382
383impl ::core::iter::FusedIterator for Decode<'_> {}
384
385#[cfg(test)]
386mod tests {
387 use super::{*, Error::*};
388
389 #[test]
390 fn first_valid_1() {
391 let bytes = &[0x00];
392
393 let (result, rest) = decode_one(bytes);
394
395 assert_eq!(result, Ok('\u{0000}'));
396 assert_eq!(rest, &[]);
397 }
398
399 #[test]
400 fn first_valid_2() {
401 let bytes = &[0xC2, 0x80];
402
403 let (result, rest) = decode_one(bytes);
404
405 assert_eq!(result, Ok('\u{0080}'));
406 assert_eq!(rest, &[]);
407 }
408
409 #[test]
410 fn first_valid_3() {
411 let bytes = &[0xE0, 0xA0, 0x80];
412
413 let (result, rest) = decode_one(bytes);
414
415 assert_eq!(result, Ok('\u{0800}'));
416 assert_eq!(rest, &[]);
417 }
418
419 #[test]
420 fn first_valid_4() {
421 let bytes = &[0xF0, 0x90, 0x80, 0x80];
422
423 let (result, rest) = decode_one(bytes);
424
425 assert_eq!(result, Ok('\u{10000}'));
426 assert_eq!(rest, &[]);
427 }
428
429 #[test]
430 fn last_valid_1() {
431 let bytes = &[0x7F];
432
433 let (result, rest) = decode_one(bytes);
434
435 assert_eq!(result, Ok('\u{007F}'));
436 assert_eq!(rest, &[]);
437 }
438
439 #[test]
440 fn last_valid_2() {
441 let bytes = &[0xDF, 0xBF];
442
443 let (result, rest) = decode_one(bytes);
444
445 assert_eq!(result, Ok('\u{07FF}'));
446 assert_eq!(rest, &[]);
447 }
448
449 #[test]
450 fn last_valid_3() {
451 let bytes = &[0xEF, 0xBF, 0xBF];
452
453 let (result, rest) = decode_one(bytes);
454
455 assert_eq!(result, Ok('\u{FFFF}'));
456 assert_eq!(rest, &[]);
457 }
458
459 #[test]
460 fn last_valid_4() {
461 let bytes = &[0xF4, 0x8F, 0xBF, 0xBF];
462
463 let (result, rest) = decode_one(bytes);
464
465 assert_eq!(result, Ok('\u{10FFFF}'));
466 assert_eq!(rest, &[]);
467 }
468
469 #[test]
470 fn first_before_surrogates() {
471 let bytes = &[0xED, 0x9F, 0xBF];
472
473 let (result, rest) = decode_one(bytes);
474
475 assert_eq!(result, Ok('\u{D7FF}'));
476 assert_eq!(rest, &[]);
477 }
478
479 #[test]
480 fn first_after_surrogates() {
481 let bytes = &[0xEE, 0x80, 0x80];
482
483 let (result, rest) = decode_one(bytes);
484
485 assert_eq!(result, Ok('\u{E000}'));
486 assert_eq!(rest, &[]);
487 }
488
489 #[test]
490 fn invalid_bytes() {
491 let bytes = &[0xC0, 0xC1,
492 0xF5, 0xF6, 0xF7, 0xF8,
493 0xF9, 0xFA, 0xFB, 0xFC,
494 0xFD, 0xFE, 0xFF];
495
496 for b in bytes {
497 let slice = core::slice::from_ref(b);
498
499 let (result, rest) = decode_one(slice);
500
501 assert_eq!(result, Err(InvalidByte));
502 assert_eq!(rest, &[]);
503 }
504 }
505
506 #[test]
507 fn continuation_bytes() {
508 for b in 0x80..=0xBF {
509 let slice = core::slice::from_ref(&b);
510
511 let (result, rest) = decode_one(slice);
512
513 assert_eq!(result, Err(UnexpectedContinuation));
514 assert_eq!(rest, &[]);
515 }
516 }
517}