encode_unicode/
errors.rs

1/* Copyright 2016-2022 Torbjørn Birch Moltu
2 *
3 * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4 * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5 * http://opensource.org/licenses/MIT>, at your option. This file may not be
6 * copied, modified, or distributed except according to those terms.
7 */
8
9
10//! Boilerplate-y error types.
11//!
12//! The discriminant values of the enums might change in minor releases.
13//! (to reduce the size of the `Result<>` types they are returned in)
14
15extern crate core;
16use core::fmt::{self,Display,Formatter};
17use core::ops::RangeInclusive;
18#[cfg(feature="std")]
19use std::error::Error;
20
21
// Generates the boilerplate shared by the error types in this module:
// a `description()` method and a `Display` impl that prints the same text.
//
// `$desc` must be an expression that can be called with `&$err` and returns
// a `&'static str` (in practice a closure literal).
// With the `std` feature `description()` is the `std::error::Error` method;
// without it an inherent method with the same name is generated instead.
macro_rules! description {
    ($err:ty, $desc:expr) => {
        #[cfg(feature = "std")]
        impl Error for $err {
            fn description(&self) -> &'static str {
                ($desc)(self)
            }
        }

        #[cfg(not(feature = "std"))]
        impl $err {
            #[allow(missing_docs)]
            pub fn description(&self) -> &'static str {
                ($desc)(self)
            }
        }

        impl Display for $err {
            fn fmt(&self, out: &mut Formatter) -> fmt::Result {
                // `Error::description()` is deprecated, but this calls our own implementation.
                #![allow(deprecated)]
                out.write_str(self.description())
            }
        }
    };
}
43
44
// Declares a unit-struct error type for errors that have only one cause.
//
// Usage: `single_cause!{ /** docs */ TypeName => "description text" }`.
// The attributes (doc comments) are forwarded to the generated struct, and
// `description!` supplies `description()` and `Display`, which both yield
// the given text.
macro_rules! single_cause {
    (
        $(#[$attr:meta])*
        $name:ident => $text:expr
    ) => {
        $(#[$attr])*
        #[derive(Clone, Copy, Debug, PartialEq, Eq)]
        pub struct $name;

        description! { $name, |_| $text }
    };
}
51
52
single_cause!{
    /// Error returned by [`U16UtfExt::utf16_needs_extra_unit()`](../trait.U16UtfExt.html#tymethod.utf16_needs_extra_unit)
    /// when called on a `u16` that's a trailing surrogate.
    ///
    /// This is a unit struct because there is only one possible cause;
    /// it is displayed as `is a trailing surrogate`.
    Utf16FirstUnitError => "is a trailing surrogate"
}
58
single_cause!{
    /// Error returned by [`Utf8Char::from_ascii()`](../struct.Utf8Char.html#method.from_ascii)
    /// for bytes that are not ASCII characters.
    ///
    /// This is a unit struct because there is only one possible cause;
    /// it is displayed as `not an ASCII character`.
    NonAsciiError => "not an ASCII character"
}
64
single_cause!{
    /// Error returned by [`Utf16Char::from_bmp()`](../struct.Utf16Char.html#method.from_bmp)
    /// for units that are not a standalone codepoint.
    ///
    /// This is a unit struct because there is only one possible cause;
    /// it is displayed as `not a codepoint in the basic multilingual plane`.
    NonBmpError => "not a codepoint in the basic multilingual plane"
}
70
single_cause!{
    /// Error returned by [`Utf8Char::from_str_start()`](../struct.Utf8Char.html#method.from_str_start)
    /// and [`Utf16Char::from_str_start()`](../struct.Utf16Char.html#method.from_str_start)
    /// when called with an empty string.
    ///
    /// This is a unit struct because there is only one possible cause;
    /// it is displayed as `is empty`.
    EmptyStrError => "is empty"
}
77
78
79
// Declares a field-less error enum together with its description texts.
//
// Every `Variant => "text",` entry (at least one, trailing comma required)
// becomes a variant of the enum. `description!` is then given a closure that
// maps each variant to its text, which provides `description()` and `Display`.
// Attributes (doc comments) on the type and on the variants are forwarded.
macro_rules! simple {
    (
        $(#[$enum_attr:meta])*
        $name:ident {
            $(
                $(#[$variant_attr:meta])*
                $variant:ident => $text:expr,
            )+
        }
    ) => {
        $(#[$enum_attr])*
        #[derive(Clone, Copy, Debug, PartialEq, Eq)]
        pub enum $name {
            $(
                $(#[$variant_attr])*
                $variant,
            )+
        }

        description! {
            $name,
            |err: &$name| match *err {
                $( $name::$variant => $text, )+
            }
        }
    };
}
90
91
simple!{
    /// Error returned when a `u32` is not a valid unicode codepoint.
    ///
    /// [`error_range()`](#method.error_range) gives the range of values
    /// a variant is returned for.
    CodepointError {
        /// The value is reserved for UTF-16 surrogate pairs.
        Utf16Reserved => "is reserved for UTF-16 surrogate pairs",
        /// The value is higher than the highest codepoint (which is `0x10ffff`).
        TooHigh => "is higher than the highest codepoint",
    }}
100use CodepointError::*;
101impl CodepointError {
102    /// Get the range of values for which this error would be given.
103    pub const fn error_range(self) -> RangeInclusive<u32> {match self {
104        Utf16Reserved => 0xd8_00..=0xdf_ff,
105        TooHigh => 0x00_10_ff_ff..=0xff_ff_ff_ff,
106    }}
107}
108
109
simple!{
    /// Error returned when a `[u16; 2]` doesn't form a valid UTF-16 codepoint.
    ///
    /// Both variants are about surrogates: either the array starts with a
    /// trailing surrogate, or a required trailing surrogate is missing from
    /// the second element.
    Utf16ArrayError {
        /// The first element is a trailing / low surrogate, which is never valid.
        FirstIsTrailingSurrogate => "the first element is a trailing surrogate",
        /// The second element is needed, but is not a trailing surrogate.
        SecondIsNotTrailingSurrogate => "the second element is needed but is not a trailing surrogate",
    }}
118
simple!{
    /// Error returned when one or two `u16`s are not valid UTF-16.
    ///
    /// The variants are listed in descending order of precedence:
    /// the condition that causes an earlier variant to be returned is checked
    /// before the conditions the later variants are returned for.
    Utf16TupleError {
        /// The first unit is a trailing / low surrogate, which is never valid.
        FirstIsTrailingSurrogate => "the first unit is a trailing surrogate",
        /// A second unit was provided, but the first unit doesn't need one.
        SuperfluousSecond => "the second unit is superfluous",
        /// The first and only unit requires a second unit.
        MissingSecond => "the first unit requires a second unit",
        /// The second unit is needed and was provided, but is not a trailing surrogate.
        SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate",
    }}
135
136
simple!{
    /// Error returned when a slice of `u16`s doesn't start with valid UTF-16.
    ///
    /// Only the start of the slice is described by these variants:
    /// the slice being empty, its first unit, or its second unit.
    Utf16SliceError {
        /// The slice is empty.
        EmptySlice => "the slice is empty",
        /// The first unit is a trailing surrogate.
        FirstIsTrailingSurrogate => "the first unit is a trailing surrogate",
        /// The first and only unit requires a second unit.
        MissingSecond => "the first and only unit requires a second one",
        /// The first unit requires a second one, but the second unit is not
        /// a trailing surrogate.
        SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate",
    }}
149
simple!{
    /// Error returned by [`Utf16CharMerger`](../iterator/struct.Utf16CharMerger.html#impl-Iterator)
    /// when it encounters an invalid sequence.
    Utf16PairError {
        /// A trailing surrogate was not preceded by a leading surrogate.
        UnexpectedTrailingSurrogate => "a trailing surrogate was not preceeded by a leading surrogate",
        /// A leading surrogate was followed by a unit that was not a trailing surrogate.
        UnmatchedLeadingSurrogate => "a leading surrogate was followed by an unit that was not a trailing surrogate",
        /// The end was reached while a trailing surrogate was expected.
        Incomplete => "a trailing surrogate was expected when the end was reached",
    }}
161
162
simple!{
    /// Error returned when [`Utf8Char::from_str()`](../struct.Utf8Char.html#impl-FromStr)
    /// or [`Utf16Char::from_str()`](../struct.Utf16Char.html#impl-FromStr) fails.
    ///
    /// Both types hold exactly one codepoint, so the string must contain
    /// neither more nor fewer than one.
    FromStrError {
        /// `Utf8Char` and `Utf16Char` cannot store more than a single codepoint.
        MultipleCodepoints => "contains more than one codepoint",
        /// `Utf8Char` and `Utf16Char` cannot be empty.
        Empty => "is empty",
    }
}
173
174
175
176/// Error returned when an invalid UTF-8 sequence is encountered.
177///
178/// See [`Utf8ErrorKind`](enum.Utf8ErrorKind.html) for the types of errors
179/// that this type can be returned for.
180#[derive(Clone,Copy, Debug, PartialEq,Eq)]
181pub struct Utf8Error {
182    pub(crate) kind: Utf8ErrorKind,
183}
184impl Utf8Error {
185    /// Get the type of error.
186    pub const fn kind(&self) -> Utf8ErrorKind {
187        self.kind
188    }
189
190    #[cfg(not(feature="std"))]
191    #[allow(missing_docs)]
192    pub const fn description(&self) -> &'static str {
193        utf8_error_description(self.kind)
194    }
195}
196#[cfg(feature="std")]
197impl Error for Utf8Error {
198    fn description(&self) -> &'static str {
199        utf8_error_description(self.kind)
200    }
201}
202impl Display for Utf8Error {
203    fn fmt(&self,  fmtr: &mut Formatter) -> fmt::Result {
204        fmtr.write_str(utf8_error_description(self.kind))
205    }
206}
207
/// The types of errors that can occur when decoding a UTF-8 codepoint.
///
/// The variants are more technical than what an end user is likely interested
/// in, but might be useful for deciding how to handle the error.
///
/// They can be grouped into three categories:
/// * Will happen regularly if decoding chunked or buffered text: `TooFewBytes`.
/// * Input might be binary, a different encoding or corrupted (a broken
///   UTF-8 sequence): `NonUtf8Byte`, `UnexpectedContinuationByte`
///   and `InterruptedSequence`.
/// * Less likely to happen accidentally and might be malicious:
///   `OverlongEncoding`, `Utf16ReservedCodepoint` and `TooHighCodepoint`.
///   Note that these can still be caused by certain valid latin-1 strings
///   such as `"Á©"` (`b"\xC1\xA9"`).
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub enum Utf8ErrorKind {
    /// There are too few bytes to decode the codepoint.
    ///
    /// This can happen when a slice is empty or too short, or an iterator
    /// returned `None` while in the middle of a codepoint.  
    /// This error is never produced by functions accepting fixed-size
    /// `[u8; 4]` arrays.
    ///
    /// If decoding text coming chunked (such as in buffers passed to `Read`),
    /// the remaining bytes should be carried over into the next chunk or buffer.
    /// (including the byte this error was produced for.)
    TooFewBytes,
    /// A byte which is never used by well-formed UTF-8 was encountered.
    ///
    /// This means that the input is using a different encoding,
    /// is corrupted or binary.
    ///
    /// This error is returned when a byte in the following ranges
    /// is encountered anywhere in a UTF-8 sequence:
    ///
    /// * `192` and `193` (`0b1100_000x`): Indicates an overlong encoding
    ///   of a single-byte, ASCII, character, and should therefore never occur.
    /// * `248..` (`0b1111_1xxx`): Sequences cannot be longer than 4 bytes.
    /// * `245..=247` (`0b1111_0101..=0b1111_0111`): Indicates a too high
    ///   codepoint. (above `\u10ffff`)
    NonUtf8Byte,
    /// The first byte is not a valid start of a codepoint.
    ///
    /// This might happen as a result of slicing into the middle of a codepoint,
    /// the input not being UTF-8 encoded or being corrupted.
    /// Errors of this type coming right after another error should probably
    /// be ignored, unless returned more than three times in a row.
    ///
    /// This error is returned when the first byte has a value in the range
    /// `128..=191` (`0b1000_0000..=0b1011_1111`).
    UnexpectedContinuationByte,
    /// The byte at index 1..=3 should be a continuation byte,
    /// but doesn't fit the pattern `0b10xx_xxxx`.
    ///
    /// When the input slice or iterator has too few bytes,
    /// [`TooFewBytes`](#variant.TooFewBytes) is returned instead.
    InterruptedSequence,
    /// The encoding of the codepoint has so many leading zeroes that it
    /// could be a byte shorter.
    ///
    /// [Successfully decoding this can present a security issue](https://tools.ietf.org/html/rfc3629#section-10):
    /// Doing so could allow an attacker to circumvent input validation that
    /// only checks for ASCII characters, and input characters or strings that
    /// would otherwise be rejected, such as `/../`.
    ///
    /// This error is only returned for 3 and 4-byte encodings;
    /// `NonUtf8Byte` is returned for bytes that start longer or shorter
    /// overlong encodings.
    OverlongEncoding,
    /// The codepoint is reserved for UTF-16 surrogate pairs.
    ///
    /// (`Utf8Char` cannot be used to work with the
    /// [WTF-8](https://simonsapin.github.io/wtf-8) encoding for UCS-2 strings.)
    ///
    /// This error is returned for codepoints in the range `\ud800`..=`\udfff`.
    /// (which are three bytes long as UTF-8)
    Utf16ReservedCodepoint,
    /// The codepoint is higher than `\u10ffff`, which is the highest codepoint
    /// unicode permits.
    TooHighCodepoint,
}
289const fn utf8_error_description(kind: Utf8ErrorKind) -> &'static str {
290    match kind {
291        Utf8ErrorKind::TooFewBytes => "too few bytes",
292        Utf8ErrorKind::NonUtf8Byte => "not UTF-8",
293        Utf8ErrorKind::UnexpectedContinuationByte => "not UTF-8",
294        Utf8ErrorKind::InterruptedSequence => "not UTF-8",
295        Utf8ErrorKind::OverlongEncoding => "malformed input",
296        Utf8ErrorKind::Utf16ReservedCodepoint => "malformed input",
297        Utf8ErrorKind::TooHighCodepoint => "invalid character",
298    }
299}
300impl PartialEq<Utf8ErrorKind> for Utf8Error {
301    fn eq(&self,  kind: &Utf8ErrorKind) -> bool {
302        self.kind == *kind
303    }
304}
305impl PartialEq<Utf8Error> for Utf8ErrorKind {
306    fn eq(&self,  error: &Utf8Error) -> bool {
307        *self == error.kind
308    }
309}