use crate::Error::FromUtf8Error;
use crate::Result;
/// Encodes a Rust string as Java "modified UTF-8" (MUTF-8).
///
/// The output differs from standard UTF-8 in two ways:
/// - `U+0000` is written as the two-byte sequence `0xC0 0x80`, so the result never contains a
///   zero byte.
/// - Code points above `U+FFFF` are split into a UTF-16 surrogate pair, and each surrogate is
///   written as its own three-byte sequence (six bytes in total).
///
/// # Errors
///
/// This implementation has no failure path and always returns `Ok`.
#[expect(clippy::cast_possible_truncation)]
pub fn to_bytes(data: &str) -> Result<Vec<u8>> {
    /// Packs a 16-bit value into the three-byte `1110xxxx 10xxxxxx 10xxxxxx` form.
    fn three_byte_form(unit: u32) -> [u8; 3] {
        [
            0xE0 | ((unit >> 12) as u8),
            0x80 | (((unit >> 6) & 0x3F) as u8),
            0x80 | ((unit & 0x3F) as u8),
        ]
    }

    let utf8 = data.as_bytes();
    // Input made only of bytes 0x01..=0x7F is encoded identically in UTF-8 and MUTF-8.
    let needs_rewrite = utf8.iter().any(|&b| b == 0 || b >= 0x80);
    if !needs_rewrite {
        return Ok(utf8.to_vec());
    }

    // Worst-case growth is 2x: a NUL goes from one byte to two, and a four-byte character
    // goes to six.
    let mut out = Vec::with_capacity(data.len() * 2);
    for scalar in data.chars().map(u32::from) {
        if scalar == 0 {
            out.extend_from_slice(&[0xC0, 0x80]);
        } else if scalar < 0x80 {
            out.push(scalar as u8);
        } else if scalar < 0x800 {
            out.extend_from_slice(&[
                0xC0 | ((scalar >> 6) as u8),
                0x80 | ((scalar & 0x3F) as u8),
            ]);
        } else if scalar < 0x1_0000 {
            out.extend_from_slice(&three_byte_form(scalar));
        } else {
            // Supplementary plane: emit the high surrogate, then the low surrogate.
            let offset = scalar - 0x1_0000;
            out.extend_from_slice(&three_byte_form(0xD800 + (offset >> 10)));
            out.extend_from_slice(&three_byte_form(0xDC00 + (offset & 0x3FF)));
        }
    }
    Ok(out)
}
/// Decodes MUTF-8 bytes into UTF-16 code units.
///
/// Each encoded sequence yields exactly one code unit, so surrogate halves stored as
/// three-byte sequences come out as separate units and are not combined. Trailing bytes are
/// masked with `0x3F` without checking that they are continuation bytes, and a bare `0x00`
/// byte is accepted as code unit `0`.
///
/// # Errors
///
/// Returns `FromUtf8Error` when a two- or three-byte sequence is cut short by the end of
/// `input`, or when a lead byte is outside `0x00..=0x7F`, `0xC0..=0xDF` and `0xE0..=0xEF`.
pub fn to_utf16(input: &[u8]) -> Result<Vec<u16>> {
    let truncated = || FromUtf8Error("Invalid MUTF-8 byte sequence".to_string());

    let mut units = Vec::with_capacity(input.len());
    let mut rest = input;
    while let Some((&lead, tail)) = rest.split_first() {
        // Every arm returns the unread remainder of the input.
        rest = match lead {
            0x00..=0x7F => {
                units.push(u16::from(lead));
                tail
            }
            0xC0..=0xDF => {
                let Some((&second, after)) = tail.split_first() else {
                    return Err(truncated());
                };
                let high = u16::from(lead & 0x1F) << 6;
                units.push(high | u16::from(second & 0x3F));
                after
            }
            0xE0..=0xEF => {
                let [second, third, after @ ..] = tail else {
                    return Err(truncated());
                };
                let high = u16::from(lead & 0x0F) << 12;
                let middle = u16::from(*second & 0x3F) << 6;
                let low = u16::from(*third & 0x3F);
                units.push(high | middle | low);
                after
            }
            _ => {
                return Err(FromUtf8Error(
                    "MUTF-8 does not use 4-byte sequences".to_string(),
                ));
            }
        };
    }
    Ok(units)
}
/// Decodes MUTF-8 bytes into an owned `String`.
///
/// Three strategies are tried in order:
/// 1. Pure-ASCII input is copied without any further checks.
/// 2. Input in which the scanner finds no MUTF-8-specific sequences is validated as standard
///    UTF-8 and copied.
/// 3. Everything else goes through the full MUTF-8 decoder.
///
/// # Errors
///
/// Returns `FromUtf8Error` when standard UTF-8 validation fails in step 2, or when the MUTF-8
/// decoder rejects the input in step 3.
pub fn from_bytes(input: &[u8]) -> Result<String> {
    if input.is_ascii() {
        // SAFETY: `is_ascii` guarantees every byte is below 0x80, and any such byte sequence
        // is valid UTF-8.
        #[expect(unsafe_code)]
        let text = unsafe { std::str::from_utf8_unchecked(input) };
        Ok(text.to_owned())
    } else if has_mutf8_special_sequences(input) {
        decode_mutf8(input)
    } else {
        match std::str::from_utf8(input) {
            Ok(text) => Ok(text.to_owned()),
            Err(err) => Err(FromUtf8Error(err.to_string())),
        }
    }
}
/// Decodes MUTF-8 bytes, borrowing from `input` whenever no rewriting is needed.
///
/// Pure-ASCII input, and input in which the scanner finds no MUTF-8-specific sequences, is
/// returned as `Cow::Borrowed`. Only input that must go through the full MUTF-8 decoder
/// allocates and is returned as `Cow::Owned`.
///
/// # Errors
///
/// Returns `FromUtf8Error` when standard UTF-8 validation fails on the borrowed path, or when
/// the MUTF-8 decoder rejects the input.
pub fn from_bytes_cow(input: &[u8]) -> Result<std::borrow::Cow<'_, str>> {
    use std::borrow::Cow;

    if input.is_ascii() {
        // SAFETY: `is_ascii` guarantees every byte is below 0x80, and any such byte sequence
        // is valid UTF-8.
        #[expect(unsafe_code)]
        let text = unsafe { std::str::from_utf8_unchecked(input) };
        Ok(Cow::Borrowed(text))
    } else if has_mutf8_special_sequences(input) {
        let decoded = decode_mutf8(input)?;
        Ok(Cow::Owned(decoded))
    } else {
        match std::str::from_utf8(input) {
            Ok(text) => Ok(Cow::Borrowed(text)),
            Err(err) => Err(FromUtf8Error(err.to_string())),
        }
    }
}
/// Checks that `input` is well-formed MUTF-8 without decoding it.
///
/// Input made only of bytes `0x01..=0x7F` is accepted immediately. Anything else is handed to
/// the byte-by-byte validator.
///
/// # Errors
///
/// Returns `FromUtf8Error` when the byte-by-byte validator rejects the input.
#[inline]
pub fn validate(input: &[u8]) -> Result<()> {
    // A zero byte or any byte with the high bit set needs the detailed check.
    let needs_full_check = input.iter().any(|&b| b == 0x00 || b > 0x7F);
    if needs_full_check {
        validate_slow(input)
    } else {
        Ok(())
    }
}
/// Walks `input` sequence by sequence and checks MUTF-8 well-formedness.
///
/// Accepted forms:
/// - single bytes `0x01..=0x7F` (a bare `0x00` is rejected);
/// - two-byte sequences with a valid continuation byte, excluding the overlong forms
///   `0xC0 xx` (other than `0xC0 0x80`, the encoding of `U+0000`) and `0xC1 xx`;
/// - three-byte sequences with two valid continuation bytes, excluding the overlong form
///   `0xE0` followed by a byte below `0xA0`. Sequences that encode surrogate code units are
///   not rejected.
///
/// # Errors
///
/// Returns `FromUtf8Error` describing the first violation found.
fn validate_slow(input: &[u8]) -> Result<()> {
    /// Builds the error result for a given reason.
    fn reject(reason: &str) -> Result<()> {
        Err(FromUtf8Error(reason.to_string()))
    }

    /// Returns `true` for bytes of the form `10xxxxxx`.
    fn is_continuation(byte: u8) -> bool {
        byte & 0xC0 == 0x80
    }

    let mut rest = input;
    while let Some((&lead, tail)) = rest.split_first() {
        // Every arm returns the unread remainder of the input.
        rest = match lead {
            0x01..=0x7F => tail,
            0x00 => {
                return reject(
                    "Invalid MUTF-8: bare null byte (U+0000 must be encoded as 0xC0 0x80)",
                );
            }
            0xC0..=0xDF => {
                let Some((&second, after)) = tail.split_first() else {
                    return reject("Invalid MUTF-8: truncated 2-byte sequence");
                };
                if !is_continuation(second) {
                    return reject("Invalid MUTF-8: invalid continuation byte");
                }
                // 0xC0 and 0xC1 can only encode values below 0x80; the single permitted
                // exception is 0xC0 0x80 for U+0000.
                let overlong = lead == 0xC1 || (lead == 0xC0 && second != 0x80);
                if overlong {
                    return reject("Invalid MUTF-8: overlong 2-byte encoding");
                }
                after
            }
            0xE0..=0xEF => {
                let [second, third, after @ ..] = tail else {
                    return reject("Invalid MUTF-8: truncated 3-byte sequence");
                };
                let (second, third) = (*second, *third);
                if !(is_continuation(second) && is_continuation(third)) {
                    return reject("Invalid MUTF-8: invalid continuation byte");
                }
                // 0xE0 followed by a byte below 0xA0 encodes a value below 0x800.
                if lead == 0xE0 && second < 0xA0 {
                    return reject("Invalid MUTF-8: overlong 3-byte encoding");
                }
                after
            }
            _ => return reject("Invalid MUTF-8: invalid lead byte"),
        };
    }
    Ok(())
}
/// Reports whether `input` contains byte patterns that the internal scanner flags as
/// MUTF-8-specific.
///
/// This is the public entry point to the private scanner and forwards to it unchanged.
#[inline]
#[must_use]
pub fn has_mutf8_specials(input: &[u8]) -> bool {
    has_mutf8_special_sequences(input)
}
/// Scans `input` for byte patterns that rule out handling it as plain standard UTF-8.
///
/// Returns `true` as soon as one of these is found at a sequence boundary:
/// - `0xC0 0x80` (the MUTF-8 encoding of `U+0000`) or any `0xC1` lead byte;
/// - `0xED` followed by a byte `>= 0xA0` (a three-byte encoded surrogate);
/// - a byte in `0x80..=0xBF` where a lead byte is expected, or any byte `>= 0xF0`.
///
/// Trailing bytes of multi-byte sequences are skipped without inspection. A sequence cut off
/// by the end of `input` is treated as if its next byte were `0`, so it is not flagged.
#[inline]
fn has_mutf8_special_sequences(input: &[u8]) -> bool {
    let mut pos = 0;
    while let Some(&lead) = input.get(pos) {
        // Look-ahead byte; 0 stands in for "past the end" and never triggers a rule below.
        let following = input.get(pos + 1).copied().unwrap_or(0);
        match lead {
            0x00..=0x7F => pos += 1,
            // Ordinary two-byte sequences.
            0xC2..=0xDF => pos += 2,
            0xC0 if following != 0x80 => pos += 2,
            // Ordinary three-byte sequences, i.e. everything that is not an encoded surrogate.
            0xE0..=0xEC | 0xEE..=0xEF => pos += 3,
            0xED if following < 0xA0 => pos += 3,
            // What remains: 0xC0 0x80, 0xC1, encoded surrogates, stray continuation bytes
            // and lead bytes >= 0xF0.
            _ => return true,
        }
    }
    false
}
/// Decodes MUTF-8 bytes into a `String`, translating the Java-specific forms to standard UTF-8.
///
/// - `0xC0 0x80` becomes `U+0000`.
/// - A three-byte high surrogate immediately followed by a three-byte low surrogate becomes
///   the single supplementary code point, re-encoded as four UTF-8 bytes.
/// - A surrogate without a matching partner becomes `U+FFFD` (the replacement character).
/// - All other one-, two- and three-byte sequences are copied unchanged and checked by the
///   final `String::from_utf8` call.
///
/// Surrogate sequences are rewritten before that final check, so their trailing bytes are
/// verified here to be continuation bytes (`10xxxxxx`). Without this, malformed input such as
/// `ED A0 41` would be accepted and the stray byte silently dropped.
///
/// # Errors
///
/// Returns `FromUtf8Error` when a sequence is truncated, when a surrogate sequence has a
/// trailing byte that is not a continuation byte, when a lead byte is outside the one- to
/// three-byte ranges, or when the assembled bytes are not valid UTF-8.
#[expect(clippy::cast_possible_truncation)]
fn decode_mutf8(input: &[u8]) -> Result<String> {
    /// UTF-8 encoding of U+FFFD, substituted for unpaired surrogates.
    const REPLACEMENT: [u8; 3] = [0xEF, 0xBF, 0xBD];

    /// Returns `true` for bytes of the form `10xxxxxx`.
    fn is_continuation(byte: u8) -> bool {
        byte & 0xC0 == 0x80
    }

    /// Extracts the 16-bit payload carried by a three-byte sequence.
    fn payload(b1: u8, b2: u8, b3: u8) -> u32 {
        u32::from(b1 & 0x0F) << 12 | u32::from(b2 & 0x3F) << 6 | u32::from(b3 & 0x3F)
    }

    let malformed = || FromUtf8Error("Invalid MUTF-8 byte sequence".to_string());

    let mut result = Vec::with_capacity(input.len());
    let mut i = 0;
    while let Some(&byte1) = input.get(i) {
        match byte1 {
            0x00..=0x7F => {
                result.push(byte1);
                i += 1;
            }
            0xC0..=0xDF => {
                let Some(&byte2) = input.get(i + 1) else {
                    return Err(malformed());
                };
                if byte1 == 0xC0 && byte2 == 0x80 {
                    // MUTF-8 spelling of U+0000.
                    result.push(0);
                } else {
                    result.extend_from_slice(&[byte1, byte2]);
                }
                i += 2;
            }
            0xE0..=0xEF => {
                let (Some(&byte2), Some(&byte3)) = (input.get(i + 1), input.get(i + 2)) else {
                    return Err(malformed());
                };
                let ch = payload(byte1, byte2, byte3);
                if !(0xD800..=0xDFFF).contains(&ch) {
                    // Not a surrogate: copy as-is and let `String::from_utf8` validate it.
                    result.extend_from_slice(&[byte1, byte2, byte3]);
                    i += 3;
                    continue;
                }

                // Surrogates never reach `String::from_utf8` in their original form, so
                // their structure has to be checked here.
                if !(is_continuation(byte2) && is_continuation(byte3)) {
                    return Err(malformed());
                }

                // Only a high surrogate can start a pair; look at the next three bytes.
                let candidate = if ch <= 0xDBFF {
                    input.get(i + 3..i + 6)
                } else {
                    None
                };
                if let Some(&[next1, next2, next3]) = candidate {
                    let well_formed = next1 & 0xF0 == 0xE0
                        && is_continuation(next2)
                        && is_continuation(next3);
                    let low = payload(next1, next2, next3);
                    if well_formed && (0xDC00..=0xDFFF).contains(&low) {
                        let code = 0x1_0000 + ((ch - 0xD800) << 10) + (low - 0xDC00);
                        result.extend_from_slice(&[
                            0xF0 | ((code >> 18) as u8),
                            0x80 | (((code >> 12) & 0x3F) as u8),
                            0x80 | (((code >> 6) & 0x3F) as u8),
                            0x80 | ((code & 0x3F) as u8),
                        ]);
                        i += 6;
                        continue;
                    }
                }

                // Unpaired surrogate: substitute U+FFFD and resume after this sequence. Any
                // following bytes are examined on their own in the next iteration.
                result.extend_from_slice(&REPLACEMENT);
                i += 3;
            }
            _ => {
                return Err(FromUtf8Error(
                    "MUTF-8 does not use 4-byte sequences".to_string(),
                ));
            }
        }
    }
    String::from_utf8(result).map_err(|e| FromUtf8Error(e.to_string()))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One sample per encoding width: NUL, 1-byte, 2-byte, 3-byte and a supplementary
    /// character.
    const SAMPLE_TEXT: &str = "\u{0000}\u{007F}\u{0080}\u{07FF}\u{0800}\u{FFFF}\u{10000}";

    /// MUTF-8 encoding of `SAMPLE_TEXT`.
    const SAMPLE_MUTF8: &[u8] = &[
        0xC0, 0x80, // U+0000
        0x7F, // U+007F
        0xC2, 0x80, // U+0080
        0xDF, 0xBF, // U+07FF
        0xE0, 0xA0, 0x80, // U+0800
        0xEF, 0xBF, 0xBF, // U+FFFF
        0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80, // U+10000 as a surrogate pair
    ];

    /// Encodes and decodes every Unicode scalar value and compares against Rust's own UTF-8.
    #[test]
    fn test_all_utf8_chars() -> Result<()> {
        for code in 0..=0x10_FFFF_u32 {
            let is_surrogate = (0xD800..=0xDFFF).contains(&code);
            let Some(ch) = char::from_u32(code) else {
                // Surrogates are the only values in range that are not scalar values.
                assert!(is_surrogate);
                continue;
            };
            assert!(!is_surrogate);

            let text = ch.to_string();
            let utf8 = text.as_bytes().to_vec();
            let mutf8 = to_bytes(&text)?;
            if code == 0 {
                assert_eq!(mutf8, vec![0xC0, 0x80]);
            } else if code < 0x1_0000 {
                // Outside NUL and the supplementary planes the two encodings agree.
                assert_eq!(utf8, mutf8);
            }

            let via_std = String::from_utf8(utf8)?;
            let via_mutf8 = from_bytes(mutf8.as_slice())?;
            assert_eq!(via_std, via_mutf8);
        }
        Ok(())
    }

    /// A lone three-byte encoded surrogate is accepted by the decoder.
    #[test]
    fn test_utf8_encoding() {
        let lone_surrogate: &[u8] = &[0xED, 0xA2, 0xA2];
        assert!(from_bytes(lone_surrogate).is_ok());
    }

    #[test]
    fn test_to_bytes() -> Result<()> {
        let encoded = to_bytes(SAMPLE_TEXT)?;
        assert_eq!(encoded, SAMPLE_MUTF8);
        Ok(())
    }

    #[test]
    fn test_from_bytes() -> Result<()> {
        let decoded = from_bytes(SAMPLE_MUTF8)?;
        assert_eq!(decoded, SAMPLE_TEXT);
        Ok(())
    }

    #[test]
    fn test_from_bytes_invalid() {
        // Truncated two-byte sequence.
        assert!(from_bytes(&[0x59, 0xD9]).is_err());
        // Truncated three-byte sequence.
        assert!(from_bytes(&[0x56, 0xE7]).is_err());
        // Continuation byte where a lead byte is expected.
        assert!(from_bytes(&[0x56, 0xA8]).is_err());
        // 0xFF is never a valid lead byte.
        assert!(from_bytes(&[0x7E, 0xFF, 0xFF, 0x2A]).is_err());
    }

    #[test]
    fn test_encode_decode_supplementary_character() -> Result<()> {
        let original = String::from("\u{1F600}");
        let encoded = to_bytes(&original)?;
        let round_tripped = from_bytes(&encoded)?;
        assert_eq!(round_tripped, original);
        Ok(())
    }
}