rut/
lib.rs

1#![no_std]
2//! Rut is a small UTF-8 decoding library for applications that need to decode individual characters.\
3//! It provides a bytewise decoder, and functions for decoding byte slices.
4//!
5//! It is completely `no_std` and should provide good performance.<sup>[*citation needed*]</sup>
6//!
7//! # Conformance
8//!
9//! Rut is fully conformant to the specifications and restrictions of the [Unicode standard][unicode].\
10//! Additionally, it follows [W3C's standard for UTF-8 decoding][w3c] with regards to error signalling.
11//!
12//! # Testing
13//!
14//! Some tests are in place, however it is not comprehensive yet.
15//! However, Rut has been pretty thoroughly fuzzed on random input and passes this [stress test for UTF-8 decoders][stress].
16//!
17//! # As Seen on TV!
18//!
19//! Rut began life, and is still used in, [Termiku], a terminal emulator written in Rust.
20//!
21//! [unicode]: https://www.unicode.org/versions/latest/
22//! [w3c]: https://www.w3.org/TR/encoding/#utf-8-decoder
23//! [stress]: https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
24//! [Termiku]: https://github.com/ShinySaana/Termiku
25
26/// Bitmasks for UTF-8 bytes.
27/// 10xx xxxx for continuation bytes,
28/// 110x xxxx for 2 byte,
29/// 1110 xxxx for 3 byte, and
30/// 1111 0xxx for 4 byte leading bytes.
31const MASK: [u8; 4] = [0x3F, 0x1F, 0x0F, 0x07];
32
33/// UTF-8 length table.
34/// This is blatantly taken from Rust's corelib.
35/// However, the API for it is unstable, so we supply our own.
36/// https://doc.rust-lang.org/stable/core/str/fn.utf8_char_width.html
37const UTF8_LENGTH_TABLE: [u8; 256] = [
38    // 0x00-0x7F: 1 byte.
39    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
40    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
41    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
42    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
43    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
44    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
45    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
46    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
47    // 0x80-0xBF: continuation bytes.
48    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
49    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
50    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
51    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
52    // 0xC2-0xDF: 2 bytes.
53    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
54    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
55    // 0xE0-0xEF: 3 bytes.
56    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
57    // 0xF0-0xF4: 4 bytes.
58    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,
59];
60
61/// Result type for the [`Decoder::decode_byte`](struct.Decoder.html#method.decode_byte) method.
62///
63/// Note that this is *not* an alias for Rust's default `Result` type.
64#[derive(Copy, Clone, Eq, PartialEq, Debug)]
65pub enum DecoderResult {
66    /// A byte was decoded successfully.
67    Continue,
68    /// A character was decoded successfully.
69    Char(char),
70    /// An error occured.
71    Error(self::Error),
72}
73
74/// The error type returned by all decoding methods.
75#[derive(Copy, Clone, Eq, PartialEq, Debug)]
76pub enum Error {
77    /// An invalid byte value was encountered.
78    InvalidByte,
79
80    /// A continuation byte was encountered outside of a sequence.
81    UnexpectedContinuation,
82
83    /// A sequence was terminated by a byte that isn't a valid continuation.
84    BrokenSequence,
85
86    /// An [overlong encoding](https://en.wikipedia.org/wiki/UTF-8#Overlong_encodings) was encountered.
87    OverlongEncoding,
88
89    /// An encoding for an invalid unicode scalar value was encountered.
90    InvalidCodePoint,
91
92    /// The end of the sequence was reached before it could be fully decoded.
93    TruncatedSequence,
94}
95
96/// A bytewise UTF-8 decoder.
97#[derive(Copy, Clone, Debug, Default)]
98pub struct Decoder {
99    /// Unicode Scalar Value decoded up to this point.
100    value: u32,
101    /// Number of continuation bytes in this sequence.
102    needed: u8,
103    /// Leading byte of this sequence.
104    lead: u8,
105}
106
107impl Decoder {
108    /// Creates a new decoder.
109    #[inline]
110    pub fn new() -> Decoder {
111        Decoder::default()
112    }
113
114    /// Decodes a single byte.
115    ///
116    /// # Correct Use
117    ///
118    /// Due to the nature of bytewise UTF-8 decoding, blindly continuing through errors will lead to vastly incorrect results.
119    ///
120    /// For example, the byte sequence `C2, 41, 42` would return the results `<Continue>, <Error>, B`.\
121    /// However, Unicode expects this sequence to be treated as `<Error>, A, B`.
122    ///
123    /// This means that, when an error is encountered inside of a sequence,
124    /// the offending byte should be passsed to `decode_byte` again.\
125    /// This is because an ill-formed sequence ends at the last valid byte, not at the first invalid byte.
126    ///
127    /// This is essentially what the [`decode_one`](fn.decode_one.html) function does.
128    ///
129    /// # Example
130    ///
131    /// Performing lossy decoding with `Decoder`:
132    ///
133    /// ```
134    /// use std::fmt::Write;
135    /// use rut::{Decoder, DecoderResult, Error::*};
136    ///
137    /// let mut s = String::new();
138    /// let mut d = Decoder::new();
139    ///
140    /// // An ill-formed UTF-8 sequence as described above.
141    /// let bytes = [0xC2, 0x41, 0x42];
142    ///
143    /// for &byte in &bytes {
144    ///     let mut result = d.decode_byte(byte);
145    ///
146    ///     if let DecoderResult::Error(e) = result {
147    ///         match e {
148    ///             BrokenSequence
149    ///             | OverlongEncoding
150    ///             | InvalidCodePoint => {
151    ///                 write!(&mut s, "\u{FFFD}").unwrap();
152    ///                 result = d.decode_byte(byte);
153    ///             },
154    ///             _ => {}
155    ///         }
156    ///     }
157    ///
158    ///     match result {
159    ///         DecoderResult::Continue => continue,
160    ///         DecoderResult::Char(c)  => write!(&mut s, "{}", c).unwrap(),
161    ///         DecoderResult::Error(_) => write!(&mut s, "\u{FFFD}").unwrap()
162    ///     }
163    /// }
164    ///
165    /// assert_eq!(&s, "�AB");
166    /// ```
167    pub fn decode_byte(&mut self, byte: u8) -> DecoderResult {
168        use self::Error::*;
169
170        // These values can never appear.
171        // More specifically, these would be leading bytes.
172        // C0 and C1 can only produce overlong encodings of U+0000 through U+007F
173        // F5 through FF can only produce scalar values greater than U+10FFFF, or encodings longer than 4 bytes.
174        match byte {
175            0xC0 | 0xC1 | 0xF5..=0xFF => {
176                self.needed = 0;
177                return DecoderResult::Error(InvalidByte);
178            }
179            _ => {}
180        }
181
182        if self.needed == 0 {
183            match byte {
184                0x00..=0x7F => return DecoderResult::Char(char::from(byte)),
185                0x80..=0xBF => return DecoderResult::Error(UnexpectedContinuation),
186                _ => {}
187            }
188
189            let needed = UTF8_LENGTH_TABLE[byte as usize] - 1;
190
191            self.value = (byte & MASK[needed as usize]) as u32;
192            self.needed = needed as u8;
193            self.lead = byte;
194
195            return DecoderResult::Continue;
196        }
197
198        if !(0x80..=0xBF).contains(&byte) {
199            self.needed = 0;
200            return DecoderResult::Error(BrokenSequence);
201        }
202
203        // Check for the validity of this sequence.
204        if self.lead != 0 {
205            // E0 80 through E0 9F and F0 80 through F0 8F
206            // produce overlong encodings.
207            if self.lead == 0xE0 && byte < 0xA0 ||
208               self.lead == 0xF0 && byte < 0x90 {
209                self.needed = 0;
210                return DecoderResult::Error(OverlongEncoding);
211            }
212
213            // ED 80 through ED 9E and F4 90 through F4 BF
214            // produce invalid unicode scalar values.
215            if self.lead == 0xED && byte > 0x9F ||
216               self.lead == 0xF4 && byte > 0x8F {
217                self.needed = 0;
218                return DecoderResult::Error(InvalidCodePoint);
219            }
220
221            // We only want to do this check for the first continuation byte, so we reset `lead` here.
222            self.lead = 0;
223        }
224
225        self.value = (self.value << 6) | (byte & MASK[0]) as u32;
226        self.needed -= 1;
227
228        // We're done!
229        if self.needed == 0 {
230            return DecoderResult::Char(unsafe {
231                core::char::from_u32_unchecked(self.value)
232            });
233        }
234
235        DecoderResult::Continue
236    }
237}
238
239/// Decodes one character from a byte slice, returning a `Result` and the remainder of the slice.
240///
241/// # Panics
242///
243/// Panics if `bytes` is empty.
244///
245/// # Examples
246///
247/// ```rust
248/// // Valid UTF-8 encoding of '€'
249/// let bytes = [0xE2, 0x82, 0xAC];
250///
251/// let (result, rest) = rut::decode_one(&bytes);
252///
253/// assert_eq!(result, Ok('€'));
254/// assert_eq!(rest, &[]);
255/// ```
256///
257/// ```rust
258/// use rut::Error::*;
259///
260/// // Ill-formed sequence followed by 2 valid characters
261/// let bytes = [0xC2, 0x41, 0x42];
262///
263/// let (result1, rest1) = rut::decode_one(&bytes);
264/// let (result2, rest2) = rut::decode_one(rest1);
265/// let (result3, rest3) = rut::decode_one(rest2);
266///
267/// assert_eq!(result1, Err(BrokenSequence));
268/// assert_eq!(result2, Ok('A'));
269/// assert_eq!(result3, Ok('B'));
270/// assert_eq!(rest1, &[0x41, 0x42]);
271/// assert_eq!(rest2, &[0x42]);
272/// assert_eq!(rest3, &[]);
273/// ```
274pub fn decode_one(bytes: &[u8]) -> (Result<char, Error>, &[u8]) {
275    use self::Error::*;
276
277    // An empty slice would return `TruncatedSequence`, which doesn't make much sense.
278    // Just panic instead.
279    assert!(!bytes.is_empty());
280
281    let mut p = Decoder::new();
282
283    for (idx, &byte) in bytes.iter().enumerate() {
284        match p.decode_byte(byte) {
285            DecoderResult::Continue => continue,
286            DecoderResult::Char(c)  => return (Ok(c), &bytes[idx + 1..]),
287            // To match W3C's / Unicode's behavior,
288            // errors which occur inside a sequence require
289            // the current byte to get decoded again.
290            // This is equivalent to step 4.2 in https://www.w3.org/TR/encoding/#utf-8-decoder.
291            DecoderResult::Error(e) => match e {
292                BrokenSequence
293                | OverlongEncoding
294                | InvalidCodePoint => {
295                    return (Err(e), &bytes[idx..])
296                }
297                _ => return (Err(e), &bytes[idx + 1..]),
298            },
299        }
300    }
301
302    // We didn't get anything out of the slice (i.e. every byte returned `Continue`).
303    (Err(TruncatedSequence), &[])
304}
305
306/// An iterator that decodes characters from a byte slice.
307///
308/// This `struct` is created by the [`decode`](fn.decode.html) function.
309/// See its documentation for more.
310#[must_use = "iterators are lazy and do nothing unless consumed"]
311#[derive(Clone)]
312pub struct Decode<'a> {
313    bytes: &'a [u8],
314}
315
316/// Creates an iterator for decoding characters from a byte slice.
317///
318/// This is done by calling [`decode_one`](fn.decode_one.html) repeatedly until the slice has been exhausted.
319///
320/// # Examples
321///
322/// ```rust
323/// // Valid UTF-8 encoding of '€'
324/// let bytes = [0xE2, 0x82, 0xAC];
325///
326/// let mut it = rut::decode(&bytes);
327///
328/// assert_eq!(it.next(), Some(Ok('€')));
329/// assert_eq!(it.next(), None);
330/// ```
331///
332/// ```rust
333/// use rut::Error::*;
334///
335/// // Ill-formed sequence followed by 2 valid characters
336/// let bytes = [0xC2, 0x41, 0x42];
337///
338/// let mut it = rut::decode(&bytes);
339///
340/// assert_eq!(it.next(), Some(Err(BrokenSequence)));
341/// assert_eq!(it.next(), Some(Ok('A')));
342/// assert_eq!(it.next(), Some(Ok('B')));
343/// assert_eq!(it.next(), None);
344/// ```
345#[inline]
346pub fn decode(bytes: &[u8]) -> Decode<'_> {
347    Decode { bytes }
348}
349
350impl Decode<'_> {
351    /// Returns the unprocessed part of the stored byte slice.
352    ///
353    /// # Example
354    ///
355    /// ```rust
356    /// // "ABC"
357    /// let bytes = &[0x41, 0x42, 0x43];
358    ///
359    /// let mut d = rut::decode(bytes);
360    ///
361    /// assert_eq!(d.next(), Some(Ok('A')));
362    /// assert_eq!(d.rest(), &[0x42, 0x43]);
363    /// ```
364    pub fn rest(&self) -> &[u8] {
365        self.bytes
366    }
367}
368
369impl ::core::iter::Iterator for Decode<'_> {
370    type Item = Result<char, self::Error>;
371
372    fn next(&mut self) -> Option<Self::Item> {
373        if self.bytes.is_empty() {
374            None
375        } else {
376            let (result, rest) = decode_one(self.bytes);
377            self.bytes = rest;
378            Some(result)
379        }
380    }
381}
382
383impl ::core::iter::FusedIterator for Decode<'_> {}
384
385#[cfg(test)]
386mod tests {
387    use super::{*, Error::*};
388
389    #[test]
390    fn first_valid_1() {
391        let bytes = &[0x00];
392
393        let (result, rest) = decode_one(bytes);
394
395        assert_eq!(result, Ok('\u{0000}'));
396        assert_eq!(rest, &[]);
397    }
398
399    #[test]
400    fn first_valid_2() {
401        let bytes = &[0xC2, 0x80];
402
403        let (result, rest) = decode_one(bytes);
404
405        assert_eq!(result, Ok('\u{0080}'));
406        assert_eq!(rest, &[]);
407    }
408
409    #[test]
410    fn first_valid_3() {
411        let bytes = &[0xE0, 0xA0, 0x80];
412
413        let (result, rest) = decode_one(bytes);
414
415        assert_eq!(result, Ok('\u{0800}'));
416        assert_eq!(rest, &[]);
417    }
418
419    #[test]
420    fn first_valid_4() {
421        let bytes = &[0xF0, 0x90, 0x80, 0x80];
422
423        let (result, rest) = decode_one(bytes);
424
425        assert_eq!(result, Ok('\u{10000}'));
426        assert_eq!(rest, &[]);
427    }
428
429    #[test]
430    fn last_valid_1() {
431        let bytes = &[0x7F];
432
433        let (result, rest) = decode_one(bytes);
434
435        assert_eq!(result, Ok('\u{007F}'));
436        assert_eq!(rest, &[]);
437    }
438
439    #[test]
440    fn last_valid_2() {
441        let bytes = &[0xDF, 0xBF];
442
443        let (result, rest) = decode_one(bytes);
444
445        assert_eq!(result, Ok('\u{07FF}'));
446        assert_eq!(rest, &[]);
447    }
448
449    #[test]
450    fn last_valid_3() {
451        let bytes = &[0xEF, 0xBF, 0xBF];
452
453        let (result, rest) = decode_one(bytes);
454
455        assert_eq!(result, Ok('\u{FFFF}'));
456        assert_eq!(rest, &[]);
457    }
458
459    #[test]
460    fn last_valid_4() {
461        let bytes = &[0xF4, 0x8F, 0xBF, 0xBF];
462
463        let (result, rest) = decode_one(bytes);
464
465        assert_eq!(result, Ok('\u{10FFFF}'));
466        assert_eq!(rest, &[]);
467    }
468
469    #[test]
470    fn first_before_surrogates() {
471        let bytes = &[0xED, 0x9F, 0xBF];
472
473        let (result, rest) = decode_one(bytes);
474
475        assert_eq!(result, Ok('\u{D7FF}'));
476        assert_eq!(rest, &[]);
477    }
478
479    #[test]
480    fn first_after_surrogates() {
481        let bytes = &[0xEE, 0x80, 0x80];
482
483        let (result, rest) = decode_one(bytes);
484
485        assert_eq!(result, Ok('\u{E000}'));
486        assert_eq!(rest, &[]);
487    }
488
489    #[test]
490    fn invalid_bytes() {
491        let bytes = &[0xC0, 0xC1,
492                      0xF5, 0xF6, 0xF7, 0xF8,
493                      0xF9, 0xFA, 0xFB, 0xFC,
494                      0xFD, 0xFE, 0xFF];
495
496        for b in bytes {
497            let slice = core::slice::from_ref(b);
498
499            let (result, rest) = decode_one(slice);
500
501            assert_eq!(result, Err(InvalidByte));
502            assert_eq!(rest, &[]);
503        }
504    }
505
506    #[test]
507    fn continuation_bytes() {
508        for b in 0x80..=0xBF {
509            let slice = core::slice::from_ref(&b);
510
511            let (result, rest) =  decode_one(slice);
512
513            assert_eq!(result, Err(UnexpectedContinuation));
514            assert_eq!(rest, &[]);
515        }
516    }
517}