// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! `charset` is a wrapper around [`encoding_rs`][1] that provides
//! (non-streaming) decoding for character encodings that occur in _email_ by
//! providing decoding for [UTF-7][2] in addition to the encodings defined by
//! the [Encoding Standard][3] (and provided by `encoding_rs`).
//!
//! _Note:_ Do _not_ use this crate for consuming _Web_ content. For security
//! reasons, consumers of Web content are [_prohibited_][4] from supporting
//! UTF-7. Use `encoding_rs` directly when consuming Web content.
//!
//! The set of encodings consisting of UTF-7 and the encodings defined in the
//! Encoding Standard is believed to be appropriate for consuming email,
//! because that's the set of encodings supported by [Thunderbird][5].
//! Furthermore, UTF-7 support is believed to be necessary based on the
//! experience of the Firefox OS email client. In fact, while the UTF-7
//! implementation in this crate is independent of Thunderbird's UTF-7
//! implementation, Thunderbird uses `encoding_rs` to decode the other
//! encodings. In addition to the labels defined in the Encoding Standard,
//! this crate recognizes additional `java.io` and `java.nio` names for
//! compatibility with JavaMail. For UTF-7, IANA and Netscape 4.0 labels
//! are recognized.
//!
//! Known compatibility limitations (known from Thunderbird bug reports):
//!
//! * Some ancient Usenet postings in Chinese may not be decodable, because
//! this crate does not support HZ.
//! * Some emails sent in Chinese by Sun's email client for CDE on Solaris
//! around the turn of the millennium may not be decodable, because this
//! crate does not support ISO-2022-CN.
//! * Some emails sent in Korean by IBM/Lotus Notes may not be decodable,
//! because this crate does not support ISO-2022-KR.
//!
//! This crate intentionally does not support encoding content into legacy
//! encodings. When sending email, _always_ use UTF-8. That is, just call
//! `.as_bytes()` on `&str` and label the content as `UTF-8`.
//!
//! [1]: https://crates.io/crates/encoding_rs/
//! [2]: https://tools.ietf.org/html/rfc2152
//! [3]: https://encoding.spec.whatwg.org/
//! [4]: https://html.spec.whatwg.org/#character-encodings
//! [5]: https://thunderbird.net/
//!
//! # Security considerations
//!
//! Again, this crate is for _email_. Please do _NOT_ use it for _Web_
//! content.
//!
//! Never try to perform any security analysis on the undecoded data in
//! ASCII-incompatible encodings and in UTF-7 in particular. Always decode
//! first and analyze after. UTF-7 allows even characters that don't have to
//! be represented as base64 to be represented as base64. Also, for consistency
//! with Thunderbird, the UTF-7 decoder in this crate allows e.g. ASCII
//! controls to be represented without base64 encoding even when the spec
//! says they should be base64-encoded.
//!
//! This implementation is non-constant-time by design. An attacker who
//! can observe input length and the time it takes to decode it can make
//! guesses about relative proportions of characters from different ranges.
//! Guessing the proportion of ASCII vs. non-ASCII should be particularly
//! feasible.
#![no_std]
#[cfg_attr(feature = "serde", macro_use)]
extern crate alloc;
extern crate base64;
extern crate encoding_rs;
#[cfg(feature = "serde")]
extern crate serde;
#[cfg(all(test, feature = "serde"))]
extern crate bincode;
#[cfg(all(test, feature = "serde"))]
#[macro_use]
extern crate serde_derive;
#[cfg(all(test, feature = "serde"))]
extern crate serde_json;
use base64::engine::general_purpose::STANDARD_NO_PAD;
use base64::Engine;
use encoding_rs::CoderResult;
use encoding_rs::Encoding;
use encoding_rs::GB18030;
use encoding_rs::GBK;
use encoding_rs::UTF_16BE;
use alloc::borrow::Cow;
use alloc::string::String;
use alloc::vec::Vec;
use core::cmp::Ordering;
#[cfg(feature = "serde")]
use serde::de::Visitor;
#[cfg(feature = "serde")]
use serde::{Deserialize, Deserializer, Serialize, Serializer};
/// The UTF-7 encoding.
///
/// This is the `Charset` whose variant is `VariantCharset::Utf7`, i.e. the
/// charset that is handled by this crate's own decoder rather than by an
/// `encoding_rs::Encoding`.
pub const UTF_7: Charset = Charset {
variant: VariantCharset::Utf7,
};
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// This is useful for decoding non-conforming header names such that the
/// names stay unique and the decoding cannot fail (except for allocation
/// failure).
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// The work is delegated unchanged to `encoding_rs::mem::decode_latin1`.
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
encoding_rs::mem::decode_latin1(bytes)
}
/// Converts ASCII to UTF-8, substituting the REPLACEMENT CHARACTER for every
/// non-ASCII byte.
///
/// This can be used for strict MIME compliance when there is no declared
/// encoding.
///
/// The input is borrowed when it is ASCII-only. Otherwise the result is built
/// in a buffer whose capacity is reserved once, up front, for the worst case.
pub fn decode_ascii<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let ascii_len = Encoding::ascii_valid_up_to(bytes);
    if ascii_len < bytes.len() {
        let (ascii_prefix, rest) = bytes.split_at(ascii_len);
        // Worst case: every remaining byte turns into the three-byte UTF-8
        // form of U+FFFD.
        let mut utf8 = Vec::with_capacity(ascii_prefix.len() + rest.len() * 3);
        utf8.extend_from_slice(ascii_prefix);
        for &byte in rest {
            if byte >= 0x80 {
                utf8.extend_from_slice("\u{FFFD}".as_bytes());
            } else {
                utf8.push(byte);
            }
        }
        // SAFETY: the buffer holds only ASCII bytes and complete UTF-8
        // encodings of U+FFFD, so it is valid UTF-8.
        return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8) });
    }
    debug_assert_eq!(ascii_len, bytes.len());
    // SAFETY: `ascii_valid_up_to` reported that the whole input is ASCII,
    // and ASCII is valid UTF-8.
    Cow::Borrowed(unsafe { ::core::str::from_utf8_unchecked(bytes) })
}
/// A character encoding suitable for decoding _email_.
///
/// This is either an encoding as defined in the [Encoding Standard][1]
/// or UTF-7 as defined in [RFC 2152][2].
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://tools.ietf.org/html/rfc2152
///
/// Each `Charset` has one or more _labels_ that are used to identify
/// the `Charset` in protocol text. In MIME/IANA terminology, these are
/// called _names_ and _aliases_, but for consistency with the Encoding
/// Standard and the encoding_rs crate, they are called labels in this
/// crate. What this crate calls the _name_ (again, for consistency
/// with the Encoding Standard and the encoding_rs crate) is known as
/// _preferred name_ in MIME/IANA terminology.
///
/// Instances of `Charset` can be compared with `==`. `Charset` is
/// `Copy` and is meant to be passed by value.
///
/// _Note:_ It is wrong to use this for decoding Web content. Use
/// `encoding_rs::Encoding` instead!
#[derive(PartialEq, Debug, Copy, Clone, Hash)]
pub struct Charset {
// Which concrete charset this is: UTF-7 or an `encoding_rs` encoding.
// Private, so values can only be built inside this crate.
variant: VariantCharset,
}
impl Charset {
/// Implements the
/// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
/// algorithm with the label "UTF-7" added to the set of labels recognized.
/// GBK is unified with gb18030, since they decode the same and `Charset`
/// only supports decoding.
///
/// If, after ASCII-lowercasing and removing leading and trailing
/// whitespace, the argument matches a label defined in the Encoding
/// Standard or "utf-7", `Some(Charset)` representing the corresponding
/// encoding is returned. If there is no match, `None` is returned.
///
/// This is the right method to use if the action upon the method returning
/// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
/// When the action upon the method returning `None` is not to proceed with
/// a fallback but to refuse processing, `for_label_no_replacement()` is more
/// appropriate.
///
/// The argument is of type `&[u8]` instead of `&str` to save callers
/// that are extracting the label from a non-UTF-8 protocol the trouble
/// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
/// on it.)
#[inline]
pub fn for_label(label: &[u8]) -> Option<Charset> {
if let Some(encoding) = Encoding::for_label(label) {
Some(Charset::for_encoding(encoding))
} else if let Some(variant_charset) = for_label_extended(label) {
Some(Charset {
variant: variant_charset,
})
} else {
None
}
}
/// This method behaves the same as `for_label()`, except when `for_label()`
/// would return `Some(Charset::for_encoding(encoding_rs::REPLACEMENT))`,
/// this method returns `None` instead.
///
/// This method is useful in scenarios where a fatal error is required
/// upon invalid label, because in those cases the caller typically wishes
/// to treat the labels that map to the replacement encoding as fatal
/// errors, too.
///
/// It is not OK to use this method when the action upon the method returning
/// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) with `text/html`
/// email. In such a case, the `for_label()` method should be used instead in
/// order to avoid unsafe fallback for labels that `for_label()` maps to
/// `Some(REPLACEMENT)`. Such fallback might be safe, though not particularly
/// useful for `text/plain` email, though.
#[inline]
pub fn for_label_no_replacement(label: &[u8]) -> Option<Charset> {
if let Some(encoding) = Encoding::for_label_no_replacement(label) {
Some(Charset::for_encoding(encoding))
} else if let Some(variant_charset) = for_label_extended(label) {
Some(Charset {
variant: variant_charset,
})
} else {
None
}
}
/// Returns the `Charset` corresponding to an `&'static Encoding`.
///
/// `GBK` is unified with `GB18030`, since those two decode the same
/// and `Charset` only supports decoding.
#[inline]
pub fn for_encoding(encoding: &'static Encoding) -> Charset {
let enc = if encoding == GBK { GB18030 } else { encoding };
Charset {
variant: VariantCharset::Encoding(enc),
}
}
/// Performs non-incremental BOM sniffing.
///
/// The argument must either be a buffer representing the entire input
/// stream (non-streaming case) or a buffer representing at least the first
/// three bytes of the input stream (streaming case).
///
/// Returns `Some((Charset::for_encoding(encoding_rs::UTF_8), 3))`,
/// `Some((Charset::for_encoding(encoding_rs::UTF_16LE), 2))` or
/// `Some((Charset::for_encoding(encoding_rs::UTF_16BE), 2))` if the
/// argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or `None`
/// otherwise.
#[inline]
pub fn for_bom(buffer: &[u8]) -> Option<(Charset, usize)> {
if let Some((encoding, length)) = Encoding::for_bom(buffer) {
Some((Charset::for_encoding(encoding), length))
} else {
None
}
}
/// Returns the name of this encoding.
///
/// Mostly useful for debugging
pub fn name(self) -> &'static str {
match self.variant {
VariantCharset::Encoding(encoding) => encoding.name(),
VariantCharset::Utf7 => "UTF-7",
}
}
/// Checks whether the bytes 0x00...0x7F map exclusively to the characters
/// U+0000...U+007F and vice versa.
#[inline]
pub fn is_ascii_compatible(self) -> bool {
match self.variant {
VariantCharset::Encoding(encoding) => encoding.is_ascii_compatible(),
VariantCharset::Utf7 => false,
}
}
/// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
/// malformed sequences replaced with the REPLACEMENT CHARACTER when the
/// entire input is available as a single buffer (i.e. the end of the
/// buffer marks the end of the stream).
///
/// This method implements the (non-streaming version of) the
/// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
///
/// The second item in the returned tuple is the encoding that was actually
/// used (which may differ from this encoding thanks to BOM sniffing).
///
/// The third item in the returned tuple indicates whether there were
/// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
///
/// _Note:_ It is wrong to use this when the input buffer represents only
/// a segment of the input instead of the whole input.
///
/// # Panics
///
/// If the size calculation for a heap-allocated backing buffer overflows
/// `usize`.
#[inline]
pub fn decode<'a>(self, bytes: &'a [u8]) -> (Cow<'a, str>, Charset, bool) {
let (charset, without_bom) = match Charset::for_bom(bytes) {
Some((charset, bom_length)) => (charset, &bytes[bom_length..]),
None => (self, bytes),
};
let (cow, had_errors) = charset.decode_without_bom_handling(without_bom);
(cow, charset, had_errors)
}
/// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
/// malformed sequences replaced with the REPLACEMENT CHARACTER when the
/// entire input is available as a single buffer (i.e. the end of the
/// buffer marks the end of the stream).
///
/// When invoked on `UTF_8`, this method implements the (non-streaming
/// version of) the
/// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
/// concept.
///
/// The second item in the returned pair indicates whether there were
/// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
///
/// _Note:_ It is wrong to use this when the input buffer represents only
/// a segment of the input instead of the whole input.
///
/// # Panics
///
/// If the size calculation for a heap-allocated backing buffer overflows
/// `usize`.
#[inline]
pub fn decode_with_bom_removal<'a>(self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
match self.variant {
VariantCharset::Encoding(encoding) => encoding.decode_with_bom_removal(bytes),
VariantCharset::Utf7 => decode_utf7(bytes),
}
}
/// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
/// with malformed sequences replaced with the REPLACEMENT CHARACTER when
/// the entire input is available as a single buffer (i.e. the end of the
/// buffer marks the end of the stream).
///
/// When invoked on `UTF_8`, this method implements the (non-streaming
/// version of) the
/// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
/// spec concept.
///
/// The second item in the returned pair indicates whether there were
/// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
///
/// _Note:_ It is wrong to use this when the input buffer represents only
/// a segment of the input instead of the whole input.
///
/// # Panics
///
/// If the size calculation for a heap-allocated backing buffer overflows
/// `usize`.
#[inline]
pub fn decode_without_bom_handling<'a>(self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
match self.variant {
VariantCharset::Encoding(encoding) => encoding.decode_without_bom_handling(bytes),
VariantCharset::Utf7 => decode_utf7(bytes),
}
}
}
impl From<&'static Encoding> for Charset {
fn from(encoding: &'static Encoding) -> Self {
Charset::for_encoding(encoding)
}
}
#[cfg(feature = "serde")]
impl Serialize for Charset {
    /// Serializes the charset as the string returned by `name()`.
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let name = self.name();
        serializer.serialize_str(name)
    }
}
/// Serde visitor that resolves a string to a `Charset` via
/// `Charset::for_label()`.
#[cfg(feature = "serde")]
struct CharsetVisitor;
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for CharsetVisitor {
    type Value = Charset;

    fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result {
        formatter.write_str("a valid charset label")
    }

    /// Accepts any label that `Charset::for_label()` recognizes; every other
    /// string is reported as a custom deserialization error.
    fn visit_str<E>(self, value: &str) -> Result<Charset, E>
    where
        E: serde::de::Error,
    {
        Charset::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(format!("invalid charset label: {}", value)))
    }
}
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for Charset {
    /// Deserializes a charset from a string, which is interpreted as a label
    /// by `CharsetVisitor`.
    fn deserialize<D>(deserializer: D) -> Result<Charset, D::Error>
    where
        D: Deserializer<'de>,
    {
        let visitor = CharsetVisitor;
        deserializer.deserialize_str(visitor)
    }
}
/// The labels known to this crate but not to `encoding_rs`, already
/// lowercased.
///
/// The order is the one the binary search in `for_label_extended()` relies
/// on: ascending by length first, and among labels of equal length ascending
/// by their bytes compared from the last byte to the first. Entry `i` here
/// corresponds to entry `i` of `ENCODINGS_IN_LABEL_SORT`, so the two tables
/// must be edited together.
static LABELS_SORTED: [&'static str; 29] = [
"ms950",
"ms874",
"ms936",
"utf-7",
"ms949",
"tis620",
"euc_cn",
"euc_jp",
"koi8_r",
"euc_kr",
"koi8_u",
"iso8859_1",
"iso8859_2",
"iso8859_3",
"iso8859_4",
"iso8859_5",
"iso8859_6",
"iso8859_7",
"iso8859_9",
"iso2022jp",
"iso8859_13",
"iso8859_15",
"ms950_hkscs",
"x-windows-950",
"x-windows-874",
"x-windows-949",
"csunicode11utf7",
"unicode-1-1-utf-7",
"x-unicode-2-0-utf-7",
];
/// The charset for each label in `LABELS_SORTED`, in the same order: entry
/// `i` is what `for_label_extended()` returns when the label at index `i`
/// matches. The trailing comments name the corresponding label.
static ENCODINGS_IN_LABEL_SORT: [VariantCharset; 29] = [
VariantCharset::Encoding(&encoding_rs::BIG5_INIT), // ms950
VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT), // ms874
VariantCharset::Encoding(&encoding_rs::GB18030_INIT), // ms936
VariantCharset::Utf7, // utf-7
VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT), // ms949
VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT), // tis620
VariantCharset::Encoding(&encoding_rs::GB18030_INIT), // euc_cn
VariantCharset::Encoding(&encoding_rs::EUC_JP_INIT), // euc_jp
VariantCharset::Encoding(&encoding_rs::KOI8_R_INIT), // koi8_r
VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT), // euc_kr
VariantCharset::Encoding(&encoding_rs::KOI8_U_INIT), // koi8_u
VariantCharset::Encoding(&encoding_rs::WINDOWS_1252_INIT), // iso8859_1
VariantCharset::Encoding(&encoding_rs::ISO_8859_2_INIT), // iso8859_2
VariantCharset::Encoding(&encoding_rs::ISO_8859_3_INIT), // iso8859_3
VariantCharset::Encoding(&encoding_rs::ISO_8859_4_INIT), // iso8859_4
VariantCharset::Encoding(&encoding_rs::ISO_8859_5_INIT), // iso8859_5
VariantCharset::Encoding(&encoding_rs::ISO_8859_6_INIT), // iso8859_6
VariantCharset::Encoding(&encoding_rs::ISO_8859_7_INIT), // iso8859_7
VariantCharset::Encoding(&encoding_rs::WINDOWS_1254_INIT), // iso8859_9
VariantCharset::Encoding(&encoding_rs::ISO_2022_JP_INIT), // iso2022jp
VariantCharset::Encoding(&encoding_rs::ISO_8859_13_INIT), // iso8859_13
VariantCharset::Encoding(&encoding_rs::ISO_8859_15_INIT), // iso8859_15
VariantCharset::Encoding(&encoding_rs::BIG5_INIT), // ms950_hkscs
VariantCharset::Encoding(&encoding_rs::BIG5_INIT), // x-windows-950
VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT), // x-windows-874
VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT), // x-windows-949
VariantCharset::Utf7, // csunicode11utf7
VariantCharset::Utf7, // unicode-1-1-utf-7
VariantCharset::Utf7, // x-unicode-2-0-utf-7
];
/// Length in bytes of the longest entry in `LABELS_SORTED`. It sizes the
/// scratch buffer in `for_label_extended()`, which rejects any longer
/// candidate outright.
const LONGEST_LABEL_LENGTH: usize = 19; // x-unicode-2-0-utf-7
/// Looks up the labels that are known to this crate but not to
/// `encoding_rs`. The algorithm mirrors the label lookup of `encoding_rs`.
///
/// Whitespace (tab, LF, FF, CR, space) around the label is ignored and ASCII
/// upper case letters are lowercased. `None` is returned when the input is
/// empty or all whitespace, when the label contains a byte other than
/// `a-z`, `A-Z`, `0-9`, `.`, `_`, `-` or `:`, when anything but whitespace
/// follows the first whitespace after the label, when the label is longer
/// than `LONGEST_LABEL_LENGTH`, or when it is simply not in `LABELS_SORTED`.
#[inline(never)]
fn for_label_extended(label: &[u8]) -> Option<VariantCharset> {
    // The bytes that may pad a label on either side.
    fn is_label_whitespace(byte: u8) -> bool {
        matches!(byte, 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8)
    }

    // Before: drop the leading whitespace. No other byte means no label.
    let start = label.iter().position(|&byte| !is_label_whitespace(byte))?;
    let unpadded = &label[start..];

    // Inside: the label proper runs up to the next whitespace byte, if any.
    let core_len = unpadded
        .iter()
        .position(|&byte| is_label_whitespace(byte))
        .unwrap_or(unpadded.len());
    let (core, trailing) = unpadded.split_at(core_len);

    // There's no label this long.
    if core.len() > LONGEST_LABEL_LENGTH {
        return None;
    }
    // After: there's no label with space in the middle.
    if !trailing.iter().all(|&byte| is_label_whitespace(byte)) {
        return None;
    }

    // The characters used in labels are:
    // a-z (except q, but excluding it below seems excessive)
    // 0-9
    // . _ - :
    let mut lowered = [0u8; LONGEST_LABEL_LENGTH];
    for (slot, &byte) in lowered.iter_mut().zip(core) {
        *slot = match byte {
            b'A'..=b'Z' => byte + 0x20u8,
            b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => byte,
            _ => return None,
        };
    }
    let candidate = &lowered[..core.len()];

    // `LABELS_SORTED` is ordered by length and then by the bytes read from
    // the end towards the start, so the comparator has to do the same.
    LABELS_SORTED
        .binary_search_by(|probe| {
            let probe_bytes = probe.as_bytes();
            match probe_bytes.len().cmp(&candidate.len()) {
                Ordering::Equal => probe_bytes.iter().rev().cmp(candidate.iter().rev()),
                unequal => unequal,
            }
        })
        .ok()
        .map(|index| ENCODINGS_IN_LABEL_SORT[index])
}
/// Returns the length of the leading run of bytes that UTF-7 passes through
/// unchanged, i.e. the index of the first plus sign or non-ASCII byte, or
/// `bytes.len()` when there is no such byte.
#[inline]
fn utf7_ascii_up_to(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .position(|&byte| byte == b'+' || byte >= 0x80)
        .unwrap_or(bytes.len())
}
/// Returns the length of the leading run of base64 alphabet bytes
/// (`a-z`, `A-Z`, `0-9`, `+`, `/`), or `bytes.len()` when every byte belongs
/// to that alphabet.
#[inline]
fn utf7_base64_up_to(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .position(|&byte| !(byte.is_ascii_alphanumeric() || byte == b'+' || byte == b'/'))
        .unwrap_or(bytes.len())
}
/// Decodes one UTF-7 base64 run (`bytes`, without the surrounding plus and
/// minus signs) as unpadded base64 carrying UTF-16BE, and appends the result
/// to `string`.
///
/// Returns `true` if anything was malformed. When the end of the run is not
/// decodable as base64, bytes are dropped from the end until the rest
/// decodes, and one REPLACEMENT CHARACTER is appended after the decoded text.
#[inline]
fn utf7_base64_decode(bytes: &[u8], string: &mut String) -> bool {
    // The intermediate buffer should be long enough to fit a line
    // of 80 base64 bytes and should also be a multiple of 3. This
    // way, normal email lines will be handled in one go, but
    // longer sequences won't get split between base64 groups of
    // 4 input / 3 output bytes.
    const INPUT_CHUNK: usize = 80;
    const OUTPUT_CHUNK: usize = 60;

    let mut utf16_decoder = UTF_16BE.new_decoder_without_bom_handling();
    let mut decoded = [0u8; OUTPUT_CHUNK];
    let mut remaining = bytes;
    let mut had_errors = false;
    let mut dropped_trailing_bytes = false;
    loop {
        let is_last = remaining.len() <= INPUT_CHUNK;
        let mut chunk_len = if is_last { remaining.len() } else { INPUT_CHUNK };

        // base64 -> UTF-16BE bytes. On failure, retry with one byte less.
        let decoded_len = loop {
            match STANDARD_NO_PAD.decode_slice(&remaining[..chunk_len], &mut decoded[..]) {
                Ok(written) => break written,
                Err(_) => {
                    // Only the final chunk is expected to fail here.
                    assert!(is_last);
                    had_errors = true;
                    dropped_trailing_bytes = true;
                    chunk_len -= 1;
                }
            }
        };

        // UTF-16BE bytes -> UTF-8, growing `string` whenever it fills up.
        let mut consumed = 0;
        loop {
            let (result, read, malformed) =
                utf16_decoder.decode_to_string(&decoded[consumed..decoded_len], string, is_last);
            consumed += read;
            had_errors |= malformed;
            match result {
                CoderResult::InputEmpty => break,
                CoderResult::OutputFull => {
                    let pending = decoded_len - consumed;
                    let needed = utf16_decoder.max_utf8_buffer_length(pending).unwrap();
                    string.reserve(needed);
                }
            }
        }

        if is_last {
            if dropped_trailing_bytes {
                string.push_str("\u{FFFD}");
            }
            return had_errors;
        }
        remaining = &remaining[INPUT_CHUNK..];
    }
}
/// Decodes the whole of `bytes` as UTF-7.
///
/// Returns the decoded text and whether anything malformed was replaced with
/// the REPLACEMENT CHARACTER. The input is borrowed when it contains neither
/// a plus sign nor a non-ASCII byte.
#[inline(never)]
fn decode_utf7<'a>(bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
    let plain_len = utf7_ascii_up_to(bytes);
    if plain_len == bytes.len() {
        // SAFETY: `utf7_ascii_up_to` stops at the first byte >= 0x80 and it
        // reached the end, so the input is pure ASCII.
        let text: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
        return (Cow::Borrowed(text), false);
    }

    let (plain, mut rest) = bytes.split_at(plain_len);
    let mut had_errors = false;
    let mut out = String::with_capacity(bytes.len());
    // SAFETY: `plain` ends before the first non-ASCII byte.
    out.push_str(unsafe { core::str::from_utf8_unchecked(plain) });

    loop {
        // `rest[0]` is now either a plus sign or non-ASCII
        let lead = rest[0];
        rest = &rest[1..];
        if lead != b'+' {
            had_errors = true;
            out.push_str("\u{FFFD}");
        } else {
            let run_len = utf7_base64_up_to(rest);
            let (run, after_run) = rest.split_at(run_len);
            had_errors |= utf7_base64_decode(run, &mut out);
            match (run_len, after_run.first()) {
                (0, None) => {
                    // Plus sign didn't start a base64 run and also
                    // wasn't followed by a minus.
                    had_errors = true;
                    out.push_str("\u{FFFD}");
                    return (Cow::Owned(out), had_errors);
                }
                // The base64 run extends to the end of the input.
                (_, None) => return (Cow::Owned(out), had_errors),
                (0, Some(&b'-')) => {
                    // There was no base64 data between
                    // plus and minus, so we had the sequence
                    // meaning the plus sign itself.
                    out.push_str("+");
                    rest = &after_run[1..];
                }
                (0, Some(_)) => {
                    // Plus sign didn't start a base64 run and also
                    // wasn't followed by a minus. The byte after the plus
                    // sign is left in `rest` for the code below.
                    had_errors = true;
                    out.push_str("\u{FFFD}");
                }
                // A minus sign that terminates a run is consumed silently.
                (_, Some(&b'-')) => rest = &after_run[1..],
                // Any other terminator is ordinary input.
                (_, Some(_)) => rest = after_run,
            }
        }

        let plain_len = utf7_ascii_up_to(rest);
        let (plain, remainder) = rest.split_at(plain_len);
        // SAFETY: `plain` ends before the first non-ASCII byte.
        out.push_str(unsafe { core::str::from_utf8_unchecked(plain) });
        if remainder.is_empty() {
            return (Cow::Owned(out), had_errors);
        }
        rest = remainder;
    }
}
/// The private payload of `Charset`: what a charset actually is.
#[derive(PartialEq, Debug, Copy, Clone, Hash)]
enum VariantCharset {
// UTF-7, decoded by `decode_utf7()` in this crate.
Utf7,
// Any encoding provided by `encoding_rs`; decoding is delegated to it.
Encoding(&'static Encoding),
}
/// Test-only struct that embeds a `Charset` among ordinary fields; the serde
/// tests round-trip it through `serde_json` and `bincode`.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
num: u32,
name: String,
charset: Charset,
}
#[cfg(test)]
mod tests {
use super::*;
/// Decodes `bytes` as UTF-7, asserting that nothing was malformed.
fn utf7_no_err(bytes: &[u8]) -> String {
    let (decoded, had_errors) = UTF_7.decode_without_bom_handling(bytes);
    assert!(!had_errors);
    decoded.into_owned()
}
/// Decodes `bytes` as UTF-7, asserting that something was malformed.
fn utf7_err(bytes: &[u8]) -> String {
    let (decoded, had_errors) = UTF_7.decode_without_bom_handling(bytes);
    assert!(had_errors);
    decoded.into_owned()
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. https://creativecommons.org/publicdomain/zero/1.0/
/// `for_label()` ignores case and surrounding whitespace, maps labels of
/// the replacement encoding to `REPLACEMENT`, and rejects incomplete labels.
#[test]
fn test_for_label() {
    let recognized = [
        (" uTf-7\t ", UTF_7),
        (" uTf-8\t ", Charset::for_encoding(encoding_rs::UTF_8)),
        (
            " iSo-8859-1\t ",
            Charset::for_encoding(encoding_rs::WINDOWS_1252),
        ),
        (" gb2312\t ", Charset::for_encoding(encoding_rs::GB18030)),
        (
            " ISO-2022-KR\t ",
            Charset::for_encoding(encoding_rs::REPLACEMENT),
        ),
    ];
    for &(label, expected) in recognized.iter() {
        assert_eq!(Charset::for_label(label.as_bytes()), Some(expected));
    }

    // Proper prefixes of "utf-7" / "utf-8" are not labels.
    for label in ["u", "ut", "utf", "utf-"].iter() {
        assert_eq!(Charset::for_label(label.as_bytes()), None);
    }
}
/// `for_label_no_replacement()` agrees with `for_label()` except that
/// labels of the replacement encoding are rejected.
#[test]
fn test_for_label_no_replacement() {
    let recognized = [
        (" uTf-7\t ", UTF_7),
        (" uTf-8\t ", Charset::for_encoding(encoding_rs::UTF_8)),
        (
            " iSo-8859-1\t ",
            Charset::for_encoding(encoding_rs::WINDOWS_1252),
        ),
        (" Gb2312\t ", Charset::for_encoding(encoding_rs::GB18030)),
    ];
    for &(label, expected) in recognized.iter() {
        assert_eq!(
            Charset::for_label_no_replacement(label.as_bytes()),
            Some(expected)
        );
    }

    // The replacement-encoding label first, then proper prefixes of
    // "utf-7" / "utf-8".
    for label in [" ISO-2022-KR\t ", "u", "ut", "utf", "utf-"].iter() {
        assert_eq!(Charset::for_label_no_replacement(label.as_bytes()), None);
    }
}
/// `name()` reports the canonical name of whatever the label resolved to.
#[test]
fn test_for_label_and_name() {
    let expected_names = [
        (" uTf-7\t ", "UTF-7"),
        (" uTf-8\t ", "UTF-8"),
        (" Gb2312\t ", "gb18030"),
    ];
    for &(label, name) in expected_names.iter() {
        let charset = Charset::for_label(label.as_bytes()).unwrap();
        assert_eq!(charset.name(), name);
    }
}
// Checks that every label in `LABELS_SORTED` resolves, through the public
// `Charset::for_label()`, to the charset listed for it here. The table is
// written in a human-friendly order on purpose, so that it also catches a
// mismatch between the sorted label table and the sorted charset table.
#[test]
fn test_extended_labels() {
let cases: [(&'static str, VariantCharset); 29] = [
(
"iso8859_1",
VariantCharset::Encoding(&encoding_rs::WINDOWS_1252_INIT),
),
(
"iso8859_2",
VariantCharset::Encoding(&encoding_rs::ISO_8859_2_INIT),
),
(
"iso8859_3",
VariantCharset::Encoding(&encoding_rs::ISO_8859_3_INIT),
),
(
"iso8859_4",
VariantCharset::Encoding(&encoding_rs::ISO_8859_4_INIT),
),
(
"iso8859_5",
VariantCharset::Encoding(&encoding_rs::ISO_8859_5_INIT),
),
(
"iso8859_6",
VariantCharset::Encoding(&encoding_rs::ISO_8859_6_INIT),
),
(
"iso8859_7",
VariantCharset::Encoding(&encoding_rs::ISO_8859_7_INIT),
),
(
"iso8859_9",
VariantCharset::Encoding(&encoding_rs::WINDOWS_1254_INIT),
),
(
"iso8859_13",
VariantCharset::Encoding(&encoding_rs::ISO_8859_13_INIT),
),
(
"iso8859_15",
VariantCharset::Encoding(&encoding_rs::ISO_8859_15_INIT),
),
(
"ms936",
VariantCharset::Encoding(&encoding_rs::GB18030_INIT),
),
("ms949", VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT)),
("ms950", VariantCharset::Encoding(&encoding_rs::BIG5_INIT)),
(
"ms950_hkscs",
VariantCharset::Encoding(&encoding_rs::BIG5_INIT),
),
(
"ms874",
VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT),
),
(
"euc_jp",
VariantCharset::Encoding(&encoding_rs::EUC_JP_INIT),
),
(
"euc_kr",
VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT),
),
(
"euc_cn",
VariantCharset::Encoding(&encoding_rs::GB18030_INIT),
),
(
"koi8_r",
VariantCharset::Encoding(&encoding_rs::KOI8_R_INIT),
),
(
"koi8_u",
VariantCharset::Encoding(&encoding_rs::KOI8_U_INIT),
),
(
"x-windows-874",
VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT),
),
(
"x-windows-949",
VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT),
),
(
"x-windows-950",
VariantCharset::Encoding(&encoding_rs::BIG5_INIT),
),
(
"tis620",
VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT),
),
(
"iso2022jp",
VariantCharset::Encoding(&encoding_rs::ISO_2022_JP_INIT),
),
("x-unicode-2-0-utf-7", VariantCharset::Utf7), // Netscape 4.0 per https://jkorpela.fi/chars.html
("unicode-1-1-utf-7", VariantCharset::Utf7), // https://www.iana.org/assignments/character-sets/character-sets.xhtml
("csunicode11utf7", VariantCharset::Utf7), // https://www.iana.org/assignments/character-sets/character-sets.xhtml
("utf-7", VariantCharset::Utf7),
];
// Go through the public entry point rather than `for_label_extended()`,
// so the fallthrough from the `encoding_rs` lookup is exercised as well.
for (label, expected) in cases.iter() {
assert_eq!(
Charset::for_label(label.as_bytes()),
Some(Charset { variant: *expected })
);
}
}
#[test]
fn test_utf7_decode() {
assert_eq!(utf7_no_err(b""), "");
assert_eq!(utf7_no_err(b"ab"), "ab");
assert_eq!(utf7_no_err(b"+-"), "+");
assert_eq!(utf7_no_err(b"a+-b"), "a+b");
assert_eq!(utf7_no_err(b"+ACs-"), "+");
assert_eq!(utf7_no_err(b"+AGEAKwBi-"), "a+b");
assert_eq!(utf7_no_err(b"+JgM-"), "\u{2603}");
assert_eq!(utf7_no_err(b"+JgM."), "\u{2603}.");
assert_eq!(utf7_no_err(b"+JgM "), "\u{2603} ");
assert_eq!(utf7_no_err(b"+JgM--"), "\u{2603}-");
assert_eq!(utf7_no_err(b"+JgM"), "\u{2603}");
assert_eq!(utf7_no_err(b"+JgMmAw-"), "\u{2603}\u{2603}");
assert_eq!(utf7_no_err(b"+JgMmAw."), "\u{2603}\u{2603}.");
assert_eq!(utf7_no_err(b"+JgMmAw "), "\u{2603}\u{2603} ");
assert_eq!(utf7_no_err(b"+JgMmAw--"), "\u{2603}\u{2603}-");
assert_eq!(utf7_no_err(b"+JgMmAw"), "\u{2603}\u{2603}");
assert_eq!(utf7_no_err(b"+2D3cqQ-"), "\u{1F4A9}");
assert_eq!(utf7_no_err(b"+2D3cqQ."), "\u{1F4A9}.");
assert_eq!(utf7_no_err(b"+2D3cqQ "), "\u{1F4A9} ");
assert_eq!(utf7_no_err(b"+2D3cqQ--"), "\u{1F4A9}-");
assert_eq!(utf7_no_err(b"+2D3cqQ"), "\u{1F4A9}");
assert_eq!(utf7_no_err(b"+JgPYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp"), "\u{2603}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}");
assert_eq!(utf7_err(b"+"), "\u{FFFD}");
assert_eq!(utf7_err(b"+J-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+Jg-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+J"), "\u{FFFD}");
assert_eq!(utf7_err(b"+Jg"), "\u{FFFD}");
assert_eq!(utf7_err(b"+."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+J."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+Jg."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+ "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+J "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+Jg "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+JgMmA-"), "\u{2603}\u{FFFD}\u{FFFD}");
assert_eq!(utf7_err(b"+JgMmA"), "\u{2603}\u{FFFD}\u{FFFD}");
assert_eq!(utf7_err(b"+JgMmA."), "\u{2603}\u{FFFD}\u{FFFD}.");
assert_eq!(utf7_err(b"+JgMmA "), "\u{2603}\u{FFFD}\u{FFFD} ");
assert_eq!(utf7_err(b"+JgMm-"), "\u{2603}\u{FFFD}");
assert_eq!(utf7_err(b"+JgMm"), "\u{2603}\u{FFFD}");
assert_eq!(utf7_err(b"+JgMm."), "\u{2603}\u{FFFD}.");
assert_eq!(utf7_err(b"+JgMm "), "\u{2603}\u{FFFD} ");
assert_eq!(utf7_err(b"+2D3cq-"), "\u{FFFD}\u{FFFD}");
assert_eq!(utf7_err(b"+2D3cq"), "\u{FFFD}\u{FFFD}");
assert_eq!(utf7_err(b"+2D3cq."), "\u{FFFD}\u{FFFD}.");
assert_eq!(utf7_err(b"+2D3cq "), "\u{FFFD}\u{FFFD} ");
assert_eq!(utf7_err(b"+2D3c-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D3c"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D3c."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+2D3c "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+2D3-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D3"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D3."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+2D3 "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+2D-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+2D "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+2-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+2 "), "\u{FFFD} ");
// Lone high surrogate
assert_eq!(utf7_err(b"+2D0-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D0"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D0."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+2D0 "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+2D0AYQ-"), "\u{FFFD}a");
assert_eq!(utf7_err(b"+2D0AYQ"), "\u{FFFD}a");
assert_eq!(utf7_err(b"+2D0AYQ."), "\u{FFFD}a.");
assert_eq!(utf7_err(b"+2D0AYQ "), "\u{FFFD}a ");
assert_eq!(utf7_err(b"+2D3/QQ-"), "\u{FFFD}\u{FF41}");
assert_eq!(utf7_err(b"+2D3/QQ"), "\u{FFFD}\u{FF41}");
assert_eq!(utf7_err(b"+2D3/QQ."), "\u{FFFD}\u{FF41}.");
assert_eq!(utf7_err(b"+2D3/QQ "), "\u{FFFD}\u{FF41} ");
// Lone low surrogate
assert_eq!(utf7_err(b"+AGHcqQ-"), "a\u{FFFD}");
assert_eq!(utf7_err(b"+AGHcqQ"), "a\u{FFFD}");
assert_eq!(utf7_err(b"+AGHcqQ."), "a\u{FFFD}.");
assert_eq!(utf7_err(b"+AGHcqQ "), "a\u{FFFD} ");
}
/// Every non-ASCII byte becomes one REPLACEMENT CHARACTER; ASCII is kept.
#[test]
fn test_decode_ascii() {
    let decoded = decode_ascii(b"aa\x80bb\xFFcc");
    assert_eq!(decoded, "aa\u{FFFD}bb\u{FFFD}cc");
}
/// The `From<&'static Encoding>` impl exists and is usable.
#[test]
fn test_from() {
    let _charset = Charset::from(encoding_rs::UTF_8);
}
/// A struct holding `UTF_7` survives a round trip through both JSON and
/// bincode.
#[cfg(feature = "serde")]
#[test]
fn test_serde_utf7() {
    let original = Demo {
        num: 42,
        name: "foo".into(),
        charset: UTF_7,
    };

    let json = serde_json::to_string(&original).unwrap();
    let from_json: Demo = serde_json::from_str(&json).unwrap();
    assert_eq!(from_json, original);

    let binary = bincode::serialize(&original).unwrap();
    let from_binary: Demo = bincode::deserialize(&binary[..]).unwrap();
    assert_eq!(from_binary, original);
}
/// A struct holding an `encoding_rs`-backed charset survives a round trip
/// through both JSON and bincode.
#[cfg(feature = "serde")]
#[test]
fn test_serde_utf8() {
    let original = Demo {
        num: 42,
        name: "foo".into(),
        charset: encoding_rs::UTF_8.into(),
    };

    let json = serde_json::to_string(&original).unwrap();
    let from_json: Demo = serde_json::from_str(&json).unwrap();
    assert_eq!(from_json, original);

    let binary = bincode::serialize(&original).unwrap();
    let from_binary: Demo = bincode::deserialize(&binary[..]).unwrap();
    assert_eq!(from_binary, original);
}
}