encode_unicode/errors.rs
1/* Copyright 2016-2022 Torbjørn Birch Moltu
2 *
3 * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4 * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5 * http://opensource.org/licenses/MIT>, at your option. This file may not be
6 * copied, modified, or distributed except according to those terms.
7 */
8
9
10//! Boilerplate-y error types.
11//!
12//! The discriminant values of the enums might change in minor releases.
13//! (to reduce the size of the `Result<>` types they are returned in)
14
15extern crate core;
16use core::fmt::{self,Display,Formatter};
17use core::ops::RangeInclusive;
18#[cfg(feature="std")]
19use std::error::Error;
20
21
// Generates the boilerplate shared by the error types in this module:
// a `description()` method and a `Display` impl that prints the same text.
//
// * `$err`: the error type to implement for.
// * `$desc`: a closure expression that maps `&$err` to its `&'static str` message.
macro_rules! description {($err:ty, $desc:expr) => {
    // Without `std` there is no `Error` trait to implement,
    // so an inherent method with the same signature is provided instead.
    #[cfg(not(feature="std"))]
    impl $err {
        // No-std stand-in for `Error::description()`; intentionally has no rustdoc.
        #[allow(missing_docs)]
        pub fn description(&self) -> &'static str {
            ($desc)(self)
        }
    }
    #[cfg(feature="std")]
    impl Error for $err {
        fn description(&self) -> &'static str {
            ($desc)(self)
        }
    }
    impl Display for $err {
        fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result {
            // Evaluate the message directly instead of calling `description()`:
            // with `std` enabled that call resolves to the deprecated
            // `Error::description()` and required `allow(deprecated)`.
            // `write_str()` also matches what `Utf8Error`'s `Display` impl does.
            fmtr.write_str(($desc)(self))
        }
    }
}}
43
44
// Declares a unit-struct error type that has exactly one possible cause.
//
// Takes any attributes for the type (normally its doc comments), the type
// name, and the message handed on to `description!`.
macro_rules! single_cause {
    ($(#[$attr:meta])* $name:ident => $message:expr) => {
        $(#[$attr])*
        #[derive(Debug, Clone, Copy, PartialEq, Eq)]
        pub struct $name;
        // A unit struct carries no data, so the closure ignores its argument.
        description!{$name, |_| $message}
    };
}
51
52
53single_cause!{
54 /// Error returned by [`U16UtfExt::utf16_needs_extra_unit()`](../trait.U16UtfExt.html#tymethod.utf16_needs_extra_unit)
55 /// when called on an `u16` that's a trailing surrogate.
56 Utf16FirstUnitError => "is a trailing surrogate"
57}
58
59single_cause!{
60 /// Error returned by [`Utf8Char::from_ascii()`](../struct.Utf8Char.html#method.from_ascii)
61 /// for bytes that are not ASCII characters.
62 NonAsciiError => "not an ASCII character"
63}
64
65single_cause!{
66 /// Error returned by [`Utf16Char::from_bmp()`](../struct.Utf16Char.html#method.from_bmp)
67 /// for units that are not a standalone codepoint.
68 NonBmpError => "not a codepoint in the basic multilingual plane"
69}
70
71single_cause!{
72 /// Error returned by [`Utf8Char::from_str_start()`](../struct.Utf8Char.html#method.from_str_start)
73 /// and [`Utf16Char::from_str_start()`](../struct.Utf16Char.html#method.from_str_start)
74 /// when called with an empty string.
75 EmptyStrError => "is empty"
76}
77
78
79
// Declares a field-less error enum where every variant has its own message.
//
// Takes the attributes for the enum (normally its doc comments), the enum
// name, and one or more `Variant => "message",` entries which may each carry
// their own attributes. The messages are handed on to `description!`.
macro_rules! simple {
    (
        $(#[$enum_attr:meta])*
        $name:ident {
            $(
                $(#[$variant_attr:meta])*
                $variant:ident => $message:expr,
            )+
        }
    ) => {
        $(#[$enum_attr])*
        #[derive(Debug, Clone, Copy, PartialEq, Eq)]
        pub enum $name {
            $(
                $(#[$variant_attr])*
                $variant,
            )+
        }
        // One match arm per variant selects the message to report.
        description!{$name, |error: &$name| match *error {
            $( $name::$variant => $message, )+
        }}
    };
}
90
91
92simple!{
93 /// Error returned when an `u32` is not a valid unicode codepoint.
94 CodepointError {
95 /// It's reserved for UTF-16 surrogate pairs.
96 Utf16Reserved => "is reserved for UTF-16 surrogate pairs",
97 /// It's higher than the highest codepoint (which is 0x10ffff).
98 TooHigh => "is higher than the highest codepoint",
99 }}
100use CodepointError::*;
101impl CodepointError {
102 /// Get the range of values for which this error would be given.
103 pub const fn error_range(self) -> RangeInclusive<u32> {match self {
104 Utf16Reserved => 0xd8_00..=0xdf_ff,
105 TooHigh => 0x00_10_ff_ff..=0xff_ff_ff_ff,
106 }}
107}
108
109
110simple!{
111 /// Error returned when an `[u16; 2]` doesn't form a valid UTF-16 codepoint.
112 Utf16ArrayError {
113 /// The first element is a trailing / low surrogate, which is never valid.
114 FirstIsTrailingSurrogate => "the first element is a trailing surrogate",
115 /// The second element is needed, but is not a trailing surrogate.
116 SecondIsNotTrailingSurrogate => "the second element is needed but is not a trailing surrogate",
117 }}
118
119simple!{
120 /// Error returned when one or two `u16`s are not valid UTF-16.
121 ///
122 /// They are returned in sinking precedence;
123 /// The condition that causes the first variant to be returned is checked
124 /// for before the condition the next variant is returned for.
125 Utf16TupleError {
126 /// The first unit is a trailing / low surrogate, which is never valid.
127 FirstIsTrailingSurrogate => "the first unit is a trailing surrogate",
128 /// The provided second unit is not necessary.
129 SuperfluousSecond => "the second unit is superfluous",
130 /// The first and only unit requires a second unit.
131 MissingSecond => "the first unit requires a second unit",
132 /// The second unit is needed and was provided, but is not a trailing surrogate.
133 SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate",
134 }}
135
136
137simple!{
138 /// Error returned when a slice of `u16`s doesn't start with valid UTF-16.
139 Utf16SliceError {
140 /// The slice is empty.
141 EmptySlice => "the slice is empty",
142 /// The first unit is a trailing surrogate.
143 FirstIsTrailingSurrogate => "the first unit is a trailing surrogate",
144 /// The first and only unit requires a second unit.
145 MissingSecond => "the first and only unit requires a second one",
146 /// The first unit requires a second one, but it's not a trailing surrogate.
147 SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate",
148 }}
149
150simple!{
151 /// Error returned by [`Utf16CharDecoder`](../iterator/struct.Utf16CharMerger.html#impl-Iterator)
152 /// when it encounters an invalid sequence.
153 Utf16PairError {
154 /// A trailing surrogate was not preceeded by a leading surrogate.
155 UnexpectedTrailingSurrogate => "a trailing surrogate was not preceeded by a leading surrogate",
156 /// A leading surrogate was followed by an unit that was not a trailing surrogate.
157 UnmatchedLeadingSurrogate => "a leading surrogate was followed by an unit that was not a trailing surrogate",
158 /// A trailing surrogate was expected when the end was reached.
159 Incomplete => "a trailing surrogate was expected when the end was reached",
160 }}
161
162
163simple!{
164 /// Error returned when [`Utf8Char::from_str()`](../struct.Utf8Char.html#impl-FromStr)
165 /// or [`Utf16Char::from_str()`](../struct.Utf16Char.html#impl-FromStr) fails.
166 FromStrError {
167 /// `Utf8Char` and `Utf16Char` cannot store more than a single codepoint.
168 MultipleCodepoints => "contains more than one codepoint",
169 /// `Utf8Char` and `Utf16Char` cannot be empty.
170 Empty => "is empty",
171 }
172}
173
174
175
176/// Error returned when an invalid UTF-8 sequence is encountered.
177///
178/// See [`Utf8ErrorKind`](enum.Utf8ErrorKind.html) for the types of errors
179/// that this type can be returned for.
180#[derive(Clone,Copy, Debug, PartialEq,Eq)]
181pub struct Utf8Error {
182 pub(crate) kind: Utf8ErrorKind,
183}
184impl Utf8Error {
185 /// Get the type of error.
186 pub const fn kind(&self) -> Utf8ErrorKind {
187 self.kind
188 }
189
190 #[cfg(not(feature="std"))]
191 #[allow(missing_docs)]
192 pub const fn description(&self) -> &'static str {
193 utf8_error_description(self.kind)
194 }
195}
// Only available with `std`, since that is where the `Error` trait comes from.
#[cfg(feature="std")]
impl Error for Utf8Error {
    fn description(&self) -> &'static str {
        let kind = self.kind;
        utf8_error_description(kind)
    }
}
202impl Display for Utf8Error {
203 fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result {
204 fmtr.write_str(utf8_error_description(self.kind))
205 }
206}
207
/// The kinds of errors that can occur when decoding a UTF-8 codepoint.
///
/// The variants are more technical than what an end user is likely to care
/// about, but they can be useful for deciding how to handle the error.
///
/// They can be grouped into three categories:
/// * Will happen regularly if decoding chunked or buffered text: `TooFewBytes`.
/// * Input might be binary, a different encoding or corrupted
///   (a broken UTF-8 sequence):
///   `UnexpectedContinuationByte` and `InterruptedSequence`.
/// * Less likely to happen accidentally and might be malicious:
///   `OverlongEncoding`, `Utf16ReservedCodepoint` and `TooHighCodepoint`.
///   Note that these can still be caused by certain valid latin-1 strings
///   such as `"Á©"` (`b"\xC1\xA9"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Utf8ErrorKind {
    /// Not enough bytes are available to decode the codepoint.
    ///
    /// This can happen when a slice is empty or too short, or when an
    /// iterator returned `None` while in the middle of a codepoint.
    /// Functions that accept fixed-size `[u8; 4]` arrays never produce
    /// this error.
    ///
    /// If the text being decoded arrives in chunks (such as in buffers passed
    /// to `Read`), the remaining bytes should be carried over into the next
    /// chunk or buffer (including the byte this error was produced for).
    TooFewBytes,
    /// A byte that well-formed UTF-8 never uses was encountered.
    ///
    /// This means that the input is using a different encoding,
    /// is corrupted or is binary.
    ///
    /// This error is returned when a byte in one of the following ranges
    /// is encountered anywhere in an UTF-8 sequence:
    ///
    /// * `192` and `193` (`0b1100_000x`): Indicates an overlong encoding
    ///   of a single-byte, ASCII, character, and should therefore never occur.
    /// * `248..` (`0b1111_1xxx`): Sequences cannot be longer than 4 bytes.
    /// * `245..=247` (`0b1111_0101..=0b1111_0111`): Indicates a too high
    ///   codepoint (above `\u10ffff`).
    NonUtf8Byte,
    /// The first byte cannot start a codepoint.
    ///
    /// This might be the result of slicing into the middle of a codepoint,
    /// or of the input not being UTF-8 encoded or being corrupted.
    /// Errors of this kind that come right after another error should
    /// probably be ignored, unless returned more than three times in a row.
    ///
    /// This error is returned when the first byte has a value in the range
    /// `128..=191` (`0b1000_0000..=0b1011_1111`).
    UnexpectedContinuationByte,
    /// A byte at index 1..=3 should be a continuation byte,
    /// but doesn't fit the pattern `0b10xx_xxxx`.
    ///
    /// When the input slice or iterator has too few bytes,
    /// [`TooFewBytes`](#variant.TooFewBytes) is returned instead.
    InterruptedSequence,
    /// The codepoint is encoded with so many leading zeroes that the
    /// encoding could be a byte shorter.
    ///
    /// [Successfully decoding this can present a security issue](https://tools.ietf.org/html/rfc3629#section-10):
    /// Doing so could allow an attacker to circumvent input validation that
    /// only checks for ASCII characters, and input characters or strings that
    /// would otherwise be rejected, such as `/../`.
    ///
    /// This error is only returned for 3 and 4-byte encodings;
    /// `NonUtf8Byte` is returned for bytes that start longer or shorter
    /// overlong encodings.
    OverlongEncoding,
    /// The codepoint is reserved for UTF-16 surrogate pairs.
    ///
    /// (`Utf8Char` cannot be used to work with the
    /// [WTF-8](https://simonsapin.github.io/wtf-8) encoding for UCS-2 strings.)
    ///
    /// This error is returned for codepoints in the range `\ud800`..=`\udfff`
    /// (which are three bytes long as UTF-8).
    Utf16ReservedCodepoint,
    /// The codepoint is above `\u10ffff`, the highest codepoint
    /// that unicode permits.
    TooHighCodepoint,
}
289const fn utf8_error_description(kind: Utf8ErrorKind) -> &'static str {
290 match kind {
291 Utf8ErrorKind::TooFewBytes => "too few bytes",
292 Utf8ErrorKind::NonUtf8Byte => "not UTF-8",
293 Utf8ErrorKind::UnexpectedContinuationByte => "not UTF-8",
294 Utf8ErrorKind::InterruptedSequence => "not UTF-8",
295 Utf8ErrorKind::OverlongEncoding => "malformed input",
296 Utf8ErrorKind::Utf16ReservedCodepoint => "malformed input",
297 Utf8ErrorKind::TooHighCodepoint => "invalid character",
298 }
299}
300impl PartialEq<Utf8ErrorKind> for Utf8Error {
301 fn eq(&self, kind: &Utf8ErrorKind) -> bool {
302 self.kind == *kind
303 }
304}
305impl PartialEq<Utf8Error> for Utf8ErrorKind {
306 fn eq(&self, error: &Utf8Error) -> bool {
307 *self == error.kind
308 }
309}