use std::{
borrow::{Borrow, Cow},
fmt, mem,
ops::Deref,
simd::prelude::*,
};
/// A borrowed, unsized run of bytes that is treated as Modified UTF-8 (MUTF-8).
///
/// References to this type are created by reinterpreting a `&[u8]` (see
/// `Mutf8Str::from_slice`), so the type must have exactly the same layout as
/// its only field. `#[repr(transparent)]` is what guarantees that; without it
/// the reinterpretation relies on unspecified layout.
#[derive(Debug, Eq, PartialEq)]
#[repr(transparent)]
pub struct Mutf8Str {
    /// The raw bytes backing this string.
    pub(crate) slice: [u8],
}
/// Owned counterpart of [`Mutf8Str`]: a heap-allocated buffer of bytes that is
/// treated as Modified UTF-8.
#[derive(Debug, PartialEq, Eq)]
pub struct Mutf8String {
    /// The owned bytes backing this string.
    vec: Vec<u8>,
}
/// Returns `true` when every byte in `slice` is below `0x80`, i.e. has its
/// high bit clear. An empty slice counts as plain ASCII.
///
/// Callers in this file use the result to decide whether the bytes may be
/// reinterpreted as UTF-8 without validation, so a false positive here would
/// be unsound: every byte must be inspected, including short inputs and the
/// tail that is left over after any wide chunks.
///
/// The standard library's `<[u8]>::is_ascii` performs exactly this check over
/// the whole slice, so it is used instead of hand-rolled chunking.
#[inline]
fn is_plain_ascii(slice: &[u8]) -> bool {
    slice.is_ascii()
}
impl Mutf8Str {
pub fn to_string_lossy(&self) -> Cow<str> {
String::from_utf8_lossy(&self.slice)
}
#[inline]
pub fn from_slice(slice: &[u8]) -> &Mutf8Str {
unsafe { mem::transmute::<&[u8], &Mutf8Str>(slice) }
}
#[allow(clippy::should_implement_trait)]
pub fn from_str(s: &str) -> Cow<Mutf8Str> {
match mutf8::encode(s) {
Cow::Borrowed(b) => Cow::Borrowed(Mutf8Str::from_slice(b)),
Cow::Owned(o) => Cow::Owned(Mutf8String { vec: o }),
}
}
pub fn to_str(&self) -> Cow<str> {
if is_plain_ascii(&self.slice) {
unsafe { Cow::Borrowed(std::str::from_utf8_unchecked(&self.slice)) }
} else {
match mutf8::decode(&self.slice).expect("Mutf8Str must alwaus be valid MUTF-8") {
Cow::Borrowed(b) => Cow::Borrowed(b),
Cow::Owned(o) => Cow::Owned(o),
}
}
}
}
impl fmt::Display for Mutf8Str {
    /// Writes the result of [`Mutf8Str::to_str`] to the formatter verbatim.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let decoded = self.to_str();
        f.write_str(&*decoded)
    }
}
impl ToOwned for Mutf8Str {
    type Owned = Mutf8String;

    /// Copies the borrowed bytes into a freshly allocated [`Mutf8String`].
    fn to_owned(&self) -> Mutf8String {
        let vec = self.slice.to_vec();
        Mutf8String { vec }
    }
}
impl Borrow<Mutf8Str> for Mutf8String {
fn borrow(&self) -> &Mutf8Str {
self.as_str()
}
}
impl Mutf8String {
#[inline]
pub fn as_str(&self) -> &Mutf8Str {
Mutf8Str::from_slice(self.vec.as_slice())
}
pub fn into_string(self) -> String {
if is_plain_ascii(&self.vec) {
unsafe { String::from_utf8_unchecked(self.vec) }
} else {
match mutf8::decode(&self.vec).expect("Mutf8Str must alwaus be valid MUTF-8") {
Cow::Borrowed(b) => b.to_owned(),
Cow::Owned(o) => o,
}
}
}
}
impl Deref for Mutf8String {
type Target = Mutf8Str;
fn deref(&self) -> &Self::Target {
self.as_str()
}
}
#[cfg(test)]
mod tests {
    use std::borrow::Cow;

    use crate::mutf8::Mutf8Str;

    /// ASCII-only text must encode to the same bytes as UTF-8 and decode back
    /// to the original string.
    #[test]
    fn same_as_utf8() {
        let text = "Hello, world!";

        let encoded = Mutf8Str::from_str(text);
        let expected = Cow::Borrowed(Mutf8Str::from_slice(text.as_bytes()));
        assert_eq!(encoded, expected);

        let decoded = Mutf8Str::from_str(text);
        assert_eq!(decoded.to_str(), Cow::Borrowed(text));
    }

    /// U+10401 stored as two three-byte surrogate sequences must decode to the
    /// single supplementary code point.
    #[test]
    fn surrogate_pairs() {
        let expected = "\u{10401}";
        let encoded: &[u8] = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];

        let decoded = Mutf8Str::from_slice(encoded).to_str();
        assert_eq!(decoded, Cow::Borrowed(expected));
    }

    /// The two-byte sequence `C0 80` must decode to a NUL character.
    #[test]
    fn null_bytes() {
        let expected = "\0";
        let encoded: Vec<u8> = vec![0xC0, 0x80];

        let decoded = Mutf8Str::from_slice(&encoded).to_str();
        assert_eq!(decoded, Cow::Borrowed(expected));
    }
}