#[cfg(feature = "alloc")]
use alloc::borrow::ToOwned;
#[cfg(feature = "alloc")]
use alloc::vec;
use bytemuck::must_cast_slice as cast_slice;
use core::cmp::Ordering;
use core::error::Error;
use core::fmt::Write;
use core::hash::{Hash, Hasher};
use core::marker::PhantomData;
use core::ops::{Bound, Index, RangeBounds};
use core::slice::SliceIndex;
use core::{fmt, mem, ptr, slice};
#[cfg(feature = "serde")]
use serde::{
de::{self, Unexpected},
Deserialize, Deserializer, Serialize, Serializer,
};
use crate::encoding::{AlwaysValid, Encoding, RecodeCause, Utf16, Utf32, Utf8, ValidateError};
#[cfg(feature = "alloc")]
use crate::string::String;
mod iter;
use crate::encoding;
pub use iter::{CharIndices, Chars};
/// Error produced by [`Str::recode`] when the input contains a character that the output
/// encoding cannot represent.
#[derive(Clone, Debug, PartialEq)]
pub struct RecodeError {
/// Input offset reported for the point at which recoding stopped.
valid_up_to: usize,
/// The character that could not be encoded.
char: char,
/// Length reported by the encoder for the offending character. `recode_lossy` skips this many
/// input bytes, so it is presumably the character's encoded length in the *input* encoding.
char_len: u8,
}
impl RecodeError {
    /// Input offset at which recoding stopped.
    pub fn valid_up_to(&self) -> usize {
        let Self { valid_up_to, .. } = *self;
        valid_up_to
    }

    /// The character that could not be represented in the output encoding.
    pub fn char(&self) -> char {
        let Self { char, .. } = *self;
        char
    }

    /// Stored length of the offending character, widened to `usize`.
    pub fn char_len(&self) -> usize {
        usize::from(self.char_len)
    }
}
impl fmt::Display for RecodeError {
    /// Writes a one-line, human-readable message naming the offending character.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let offending = self.char;
        f.write_fmt(format_args!(
            "Error while recoding `Str`: invalid character for output encoding '{}'",
            offending
        ))
    }
}
// Marker impl: relies entirely on the `Debug` and `Display` impls; no source error is exposed.
impl Error for RecodeError {}
/// Error produced by [`Str::recode_into`] when recoding into a caller-provided buffer fails.
///
/// Carries the portion of the output buffer that was written successfully before the failure.
#[derive(Clone, PartialEq)]
pub struct RecodeIntoError<'a, E: Encoding> {
/// Amount of input the encoder reported as consumed before it stopped.
input_used: usize,
/// The valid, already-recoded prefix of the output buffer.
str: &'a Str<E>,
/// Why the encoder stopped.
cause: RecodeCause,
}
impl<'a, E: Encoding> RecodeIntoError<'a, E> {
    /// Builds the public error from the encoder-level error plus the valid output prefix.
    fn from_recode(err: encoding::RecodeError, str: &'a Str<E>) -> Self {
        let input_used = err.input_used();
        let cause = err.cause().clone();
        Self {
            input_used,
            str,
            cause,
        }
    }

    /// Amount of input that was consumed before the failure.
    pub fn valid_up_to(&self) -> usize {
        let consumed = self.input_used;
        consumed
    }

    /// The portion of the output buffer that holds successfully recoded data.
    pub fn output_valid(&self) -> &'a Str<E> {
        let written: &'a Str<E> = self.str;
        written
    }

    /// The reason recoding stopped.
    pub fn cause(&self) -> &RecodeCause {
        let Self { cause, .. } = self;
        cause
    }
}
impl<E: Encoding> fmt::Debug for RecodeIntoError<'_, E> {
    /// Hand-written so that `E` itself does not need to implement `Debug`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            input_used,
            str,
            cause,
        } = self;
        let mut out = f.debug_struct("RecodeIntoError");
        out.field("input_used", input_used);
        out.field("str", str);
        out.field("cause", cause);
        out.finish()
    }
}
impl<E: Encoding> fmt::Display for RecodeIntoError<'_, E> {
    /// Writes a fixed prefix, then lets the cause append its own description.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("Error while recoding `Str` into buffer: ")?;
        let cause = &self.cause;
        cause.write_cause(f)
    }
}
// Marker impl: relies entirely on the `Debug` and `Display` impls; no source error is exposed.
impl<E: Encoding> Error for RecodeIntoError<'_, E> {}
/// A borrowed string slice whose bytes are interpreted according to the encoding `E`.
///
/// The type is a transparent wrapper around `[u8]` (the `PhantomData` marker is zero-sized), which
/// is what the pointer casts in `from_bytes_unchecked` / `from_bytes_unchecked_mut` rely on.
#[repr(transparent)]
pub struct Str<E>(PhantomData<E>, [u8]);
impl<E: Encoding> Str<E> {
/// Reinterprets `bytes` as a `Str<E>` without validating them.
///
/// # Safety
///
/// `bytes` must be valid data for the encoding `E`. This is only checked (via `E::validate`)
/// when debug assertions are enabled.
pub unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Str<E> {
    debug_assert!(E::validate(bytes).is_ok());
    // SAFETY: `Str<E>` is `#[repr(transparent)]` over `[u8]`, so the pointer cast preserves
    // layout and slice metadata; validity for `E` is the caller's obligation.
    unsafe { &*(ptr::from_ref(bytes) as *const Str<E>) }
}
/// Reinterprets `bytes` as a mutable `Str<E>` without validating them.
///
/// # Safety
///
/// `bytes` must be valid data for the encoding `E`. This is only checked (via `E::validate`)
/// when debug assertions are enabled.
pub unsafe fn from_bytes_unchecked_mut(bytes: &mut [u8]) -> &mut Str<E> {
    debug_assert!(E::validate(bytes).is_ok());
    // SAFETY: `Str<E>` is `#[repr(transparent)]` over `[u8]`, so the pointer cast preserves
    // layout and slice metadata; validity for `E` is the caller's obligation.
    unsafe { &mut *(ptr::from_mut(bytes) as *mut Str<E>) }
}
/// Validates `bytes` for the encoding `E` and, on success, views them as a `Str<E>`.
///
/// # Errors
///
/// Returns the error from `E::validate` if the bytes are not valid for `E`.
pub fn from_bytes(bytes: &[u8]) -> Result<&Str<E>, ValidateError> {
    E::validate(bytes)?;
    // SAFETY: the bytes were validated for `E` on the line above.
    let validated = unsafe { Self::from_bytes_unchecked(bytes) };
    Ok(validated)
}
/// Validates `bytes` for the encoding `E` and, on success, views them as a mutable `Str<E>`.
///
/// # Errors
///
/// Returns the error from `E::validate` if the bytes are not valid for `E`.
pub fn from_bytes_mut(bytes: &mut [u8]) -> Result<&mut Str<E>, ValidateError> {
    E::validate(bytes)?;
    // SAFETY: the bytes were validated for `E` on the line above.
    let validated = unsafe { Self::from_bytes_unchecked_mut(bytes) };
    Ok(validated)
}
/// Length of the string in bytes (not characters).
pub fn len(&self) -> usize {
    self.1.len()
}
/// Returns `true` when the string contains no bytes.
pub fn is_empty(&self) -> bool {
    self.1.is_empty()
}
/// Borrows the raw encoded bytes backing this string.
pub fn as_bytes(&self) -> &[u8] {
    let raw: &[u8] = &self.1;
    raw
}
/// Mutably borrows the raw encoded bytes backing this string.
///
/// # Safety
///
/// The bytes must still be valid for the encoding `E` when the borrow ends; other methods hand
/// these bytes to unchecked constructors.
pub unsafe fn as_bytes_mut(&mut self) -> &mut [u8] {
    let raw: &mut [u8] = &mut self.1;
    raw
}
/// Checks that both ends of `idx` fall on character boundaries of this string.
///
/// The range is first normalised to a half-open `start..end` byte range, because
/// `is_char_boundary` answers "may a slice start or end *at* this offset". Returns `Some(())`
/// when both offsets are boundaries and `None` otherwise, including when normalising a bound
/// would overflow `usize`.
///
/// This does not check that `start <= end`; callers rely on the subsequent slice `get` for that.
fn check_bounds<R>(&self, idx: &R) -> Option<()>
where
    R: RangeBounds<usize>,
{
    let start = match idx.start_bound() {
        Bound::Included(&i) => i,
        // An excluded start means the slice begins one byte later.
        Bound::Excluded(&i) => i.checked_add(1)?,
        Bound::Unbounded => 0,
    };
    let end = match idx.end_bound() {
        // An inclusive end at `i` is the exclusive end `i + 1`.
        Bound::Included(&i) => i.checked_add(1)?,
        // An exclusive end is already the offset the slice stops at.
        Bound::Excluded(&i) => i,
        Bound::Unbounded => self.len(),
    };
    (self.is_char_boundary(start) && self.is_char_boundary(end)).then_some(())
}
pub fn get<R>(&self, idx: R) -> Option<&Self>
where
R: RangeBounds<usize> + SliceIndex<[u8], Output = [u8]>,
{
self.check_bounds(&idx)?;
Some(unsafe { Str::from_bytes_unchecked(self.as_bytes().get(idx)?) })
}
pub unsafe fn get_unchecked<R>(&self, idx: R) -> &Self
where
R: RangeBounds<usize> + SliceIndex<[u8], Output = [u8]>,
{
unsafe { Str::from_bytes_unchecked(self.as_bytes().get_unchecked(idx)) }
}
pub fn get_mut<R>(&mut self, idx: R) -> Option<&mut Self>
where
R: RangeBounds<usize> + SliceIndex<[u8], Output = [u8]>,
{
self.check_bounds(&idx)?;
Some(unsafe { Str::from_bytes_unchecked_mut(self.1.get_mut(idx)?) })
}
pub unsafe fn get_unchecked_mut<R>(&mut self, idx: R) -> &mut Self
where
R: RangeBounds<usize> + SliceIndex<[u8], Output = [u8]>,
{
unsafe { Str::from_bytes_unchecked_mut(self.as_bytes_mut().get_unchecked_mut(idx)) }
}
/// Reports whether byte offset `idx` lies on a character boundary.
///
/// The end of the string (`idx == len`) always counts as a boundary and anything past it never
/// does; offsets inside the string are decided by the encoding via `E::char_bound`.
pub fn is_char_boundary(&self, idx: usize) -> bool {
    let len = self.len();
    if idx >= len {
        idx == len
    } else {
        E::char_bound(self, idx)
    }
}
/// Returns `true` if the bytes of `other` are a prefix of this string's bytes.
pub fn starts_with(&self, other: &Self) -> bool {
    let prefix = &other.1;
    self.1.starts_with(prefix)
}
/// Returns `true` if the bytes of `other` are a suffix of this string's bytes.
pub fn ends_with(&self, other: &Self) -> bool {
    let suffix = &other.1;
    self.1.ends_with(suffix)
}
/// Returns an iterator over the characters of this string.
pub fn chars(&self) -> Chars<'_, E> {
Chars::new(self)
}
/// Returns an iterator over `(byte offset, char)` pairs for this string.
pub fn char_indices(&self) -> CharIndices<'_, E> {
CharIndices::new(self)
}
/// Overwrites the contents of this string with the bytes of `other`.
///
/// # Panics
///
/// Panics if the two strings do not have the same byte length.
pub fn copy_from(&mut self, other: &Str<E>) {
    let (src_len, dst_len) = (other.len(), self.len());
    assert!(
        src_len == dst_len,
        "Source string length ({}) doesn't match destination string length ({})",
        src_len,
        dst_len,
    );
    self.1.copy_from_slice(other.as_bytes());
}
/// Splits the string into two at byte offset `idx`.
///
/// Returns `None` if `idx` is not on a character boundary or is past the end of the string.
/// Splitting exactly at the end is allowed and yields `(self, "")`, mirroring
/// `str::split_at_checked`; `is_char_boundary` already rejects every `idx > len`, so no extra
/// length check is needed.
pub fn split_at(&self, idx: usize) -> Option<(&Str<E>, &Str<E>)> {
    if !self.is_char_boundary(idx) {
        return None;
    }
    let (head, tail) = self.1.split_at(idx);
    // SAFETY: `idx` is a character boundary of a valid string, so each side is assumed to be
    // valid for `E` on its own.
    Some(unsafe {
        (
            Str::from_bytes_unchecked(head),
            Str::from_bytes_unchecked(tail),
        )
    })
}
/// Splits the string into two mutable halves at byte offset `idx`.
///
/// Returns `None` if `idx` is not on a character boundary or is past the end of the string.
/// Splitting exactly at the end is allowed and yields `(self, "")`, mirroring
/// `str::split_at_mut_checked`; `is_char_boundary` already rejects every `idx > len`, so no
/// extra length check is needed.
pub fn split_at_mut(&mut self, idx: usize) -> Option<(&mut Str<E>, &mut Str<E>)> {
    if !self.is_char_boundary(idx) {
        return None;
    }
    let (head, tail) = self.1.split_at_mut(idx);
    // SAFETY: `idx` is a character boundary of a valid string, so each side is assumed to be
    // valid for `E` on its own.
    Some(unsafe {
        (
            Str::from_bytes_unchecked_mut(head),
            Str::from_bytes_unchecked_mut(tail),
        )
    })
}
/// Recodes this string into the encoding `E2`, writing the result into `buffer`.
///
/// On success, returns the written prefix of `buffer` as a `Str<E2>`.
///
/// # Errors
///
/// If the encoder stops early, the error carries the prefix of `buffer` that the encoder
/// reported as validly written, together with the amount of input used and the cause.
pub fn recode_into<'a, E2: Encoding>(
    &self,
    buffer: &'a mut [u8],
) -> Result<&'a Str<E2>, RecodeIntoError<'a, E2>> {
    match E2::recode(self, buffer) {
        Ok(written) => {
            // SAFETY: the encoder reported `written` bytes of output on success.
            Ok(unsafe { Str::from_bytes_unchecked(&buffer[..written]) })
        }
        Err(err) => {
            // SAFETY: the encoder reports this many output bytes as valid.
            let partial = unsafe { Str::from_bytes_unchecked(&buffer[..err.output_valid()]) };
            Err(RecodeIntoError::from_recode(err, partial))
        }
    }
}
/// Recodes this string into a newly allocated string in the encoding `E2`.
///
/// The output buffer starts at the input's byte length and grows by that amount each time the
/// encoder reports that it needs more space; recoding then resumes from where it stopped.
///
/// # Errors
///
/// Returns a [`RecodeError`] if a character cannot be represented in `E2`. Its `valid_up_to`
/// is an offset into the *whole* input, so it includes input consumed in earlier rounds.
#[cfg(feature = "alloc")]
pub fn recode<E2: Encoding>(&self) -> Result<String<E2>, RecodeError> {
    // Input that has not been recoded yet.
    let mut remaining = self;
    // Number of output bytes produced so far.
    let mut written = 0;
    let mut out = vec![0; self.1.len()];
    loop {
        match E2::recode(remaining, &mut out[written..]) {
            Ok(len) => {
                out.truncate(written + len);
                // SAFETY: everything up to `written + len` was produced by the encoder.
                return Ok(unsafe { String::<E2>::from_bytes_unchecked(out) });
            }
            Err(e) => match e.cause() {
                RecodeCause::NeedSpace { .. } => {
                    out.resize(out.len() + self.1.len(), 0);
                    remaining = &remaining[e.input_used()..];
                    written += e.output_valid();
                }
                &RecodeCause::InvalidChar { char, len } => {
                    // `e.input_used()` is relative to `remaining`; add what earlier rounds
                    // consumed to get an offset into the original input.
                    let already_consumed = self.len() - remaining.len();
                    return Err(RecodeError {
                        valid_up_to: already_consumed + e.input_used(),
                        char,
                        char_len: len as u8,
                    });
                }
            },
        }
    }
}
/// Recodes this string into a newly allocated string in the encoding `E2`, substituting
/// `E2::REPLACEMENT` for every character that `E2` cannot represent.
///
/// The output buffer starts at the input's byte length and grows whenever the encoder needs
/// more space or a replacement character has to be inserted.
///
/// # Panics
///
/// Panics if encoding `E2::REPLACEMENT` itself fails.
#[cfg(feature = "alloc")]
pub fn recode_lossy<E2: Encoding>(&self) -> String<E2> {
    // Input that has not been recoded yet.
    let mut rest = self;
    // Number of output bytes produced so far.
    let mut filled = 0;
    let mut out = vec![0; self.1.len()];
    loop {
        let err = match E2::recode(rest, &mut out[filled..]) {
            Ok(len) => {
                out.truncate(filled + len);
                // SAFETY: everything up to `filled + len` was produced by the encoder.
                return unsafe { String::from_bytes_unchecked(out) };
            }
            Err(err) => err,
        };
        let consumed = err.input_used();
        let produced = err.output_valid();
        match err.cause() {
            RecodeCause::NeedSpace { .. } => {
                out.resize(out.len() + self.1.len(), 0);
                rest = &rest[consumed..];
                filled += produced;
            }
            &RecodeCause::InvalidChar { len, .. } => {
                let replace_len = E2::char_len(E2::REPLACEMENT);
                // Guarantee room for the replacement after the valid output.
                out.resize(out.len() + replace_len, 0);
                E2::encode(E2::REPLACEMENT, &mut out[filled + produced..]).unwrap();
                // Skip the offending character in the input.
                rest = &rest[consumed + len..];
                filled += produced + replace_len;
            }
        }
    }
}
}
impl<E: AlwaysValid> Str<E> {
    /// Views `bytes` as a `Str<E>` without a fallible validation step.
    ///
    /// Only available for encodings marked `AlwaysValid`.
    pub fn from_bytes_infallible(bytes: &[u8]) -> &Str<E> {
        // SAFETY: relies on the `AlwaysValid` marker meaning any byte sequence is valid for `E`.
        let view = unsafe { Self::from_bytes_unchecked(bytes) };
        view
    }

    /// Views `bytes` as a mutable `Str<E>` without a fallible validation step.
    ///
    /// Only available for encodings marked `AlwaysValid`.
    pub fn from_bytes_infallible_mut(bytes: &mut [u8]) -> &mut Str<E> {
        // SAFETY: relies on the `AlwaysValid` marker meaning any byte sequence is valid for `E`.
        let view = unsafe { Self::from_bytes_unchecked_mut(bytes) };
        view
    }
}
impl Str<Utf8> {
    /// Views `str` as UTF-8 without validating it.
    ///
    /// # Safety
    ///
    /// `str` must be valid UTF-8.
    pub unsafe fn from_utf8_unchecked(str: &[u8]) -> &Self {
        // SAFETY: the caller guarantees `str` is valid UTF-8.
        unsafe { Self::from_bytes_unchecked(str) }
    }

    /// Validates `str` as UTF-8 and views it as a `Str<Utf8>`.
    ///
    /// # Errors
    ///
    /// Returns the validation error if `str` is not valid for the `Utf8` encoding.
    pub fn from_utf8(str: &[u8]) -> Result<&Self, ValidateError> {
        Self::from_bytes(str)
    }

    /// Views a standard library `&str` as a `Str<Utf8>`.
    pub fn from_std(value: &str) -> &Str<Utf8> {
        // SAFETY: a `&str` is always valid UTF-8.
        unsafe { Self::from_bytes_unchecked(value.as_bytes()) }
    }

    /// Views this string as a standard library `&str`.
    pub fn as_std(&self) -> &str {
        // SAFETY: a `Str<Utf8>` holds bytes valid for the `Utf8` encoding.
        unsafe { core::str::from_utf8_unchecked(&self.1) }
    }
}
impl Str<Utf16> {
    /// Views a slice of UTF-16 code units as a `Str<Utf16>` without validating it.
    ///
    /// The units are reinterpreted as bytes in place, i.e. in their in-memory byte order.
    ///
    /// # Safety
    ///
    /// The bytes of `str` must be valid for the `Utf16` encoding.
    pub unsafe fn from_utf16_unchecked(str: &[u16]) -> &Self {
        // SAFETY: the caller guarantees the data is valid for `Utf16`.
        unsafe { Self::from_bytes_unchecked(cast_slice(str)) }
    }

    /// Validates a slice of UTF-16 code units and views it as a `Str<Utf16>`.
    ///
    /// # Errors
    ///
    /// Returns the validation error if the data is not valid for the `Utf16` encoding.
    pub fn from_utf16(str: &[u16]) -> Result<&Self, ValidateError> {
        Self::from_bytes(cast_slice(str))
    }
}
impl Str<Utf32> {
    /// Views a slice of UTF-32 code units as a `Str<Utf32>` without validating it.
    ///
    /// The units are reinterpreted as bytes in place, i.e. in their in-memory byte order.
    ///
    /// # Safety
    ///
    /// The bytes of `str` must be valid for the `Utf32` encoding.
    pub unsafe fn from_utf32_unchecked(str: &[u32]) -> &Self {
        // SAFETY: the caller guarantees the data is valid for `Utf32`.
        unsafe { Self::from_bytes_unchecked(cast_slice(str)) }
    }

    /// Validates a slice of UTF-32 code units and views it as a `Str<Utf32>`.
    ///
    /// # Errors
    ///
    /// Returns the validation error if the data is not valid for the `Utf32` encoding.
    pub fn from_utf32(str: &[u32]) -> Result<&Self, ValidateError> {
        Self::from_bytes(cast_slice(str))
    }

    /// Views a slice of `char`s as a `Str<Utf32>`.
    pub fn from_chars(str: &[char]) -> &Self {
        // SAFETY: relies on every `char` being a valid code point for the `Utf32` encoding.
        unsafe { Self::from_bytes_unchecked(cast_slice(str)) }
    }

    /// Attempts to view this string as a slice of `char`s.
    ///
    /// Returns `None` when the underlying bytes are not aligned for `char`; a `Str` is only
    /// byte-aligned in general.
    pub fn try_chars(&self) -> Option<&[char]> {
        let bytes = &self.1;
        let start = bytes.as_ptr();
        if (start as usize) % mem::align_of::<char>() != 0 {
            return None;
        }
        let count = bytes.len() / mem::size_of::<char>();
        // SAFETY: the pointer is aligned for `char` (checked above) and covers `count` whole
        // `char`-sized units inside the borrowed bytes; relies on valid `Utf32` data being
        // valid `char` values in memory.
        Some(unsafe { slice::from_raw_parts(start.cast::<char>(), count) })
    }
}
impl<E: Encoding> fmt::Debug for Str<E> {
    /// Writes the characters between double quotes, followed by the encoding's shorthand.
    ///
    /// Characters are written verbatim; nothing is escaped.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("\"")?;
        self.chars().try_for_each(|c| f.write_char(c))?;
        write!(f, "\"{}", E::shorthand())
    }
}
impl<E: Encoding> fmt::Display for Str<E> {
    /// Writes every character of the string, one at a time, with no decoration.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.chars().try_for_each(|c| f.write_char(c))
    }
}
impl<E: Encoding> Default for &Str<E> {
    /// The default string reference is the empty string.
    fn default() -> Self {
        const EMPTY: &[u8] = &[];
        // SAFETY: assumes an empty byte sequence is valid for every encoding.
        unsafe { Str::from_bytes_unchecked(EMPTY) }
    }
}
#[cfg(feature = "alloc")]
impl<E: Encoding> ToOwned for Str<E> {
    type Owned = String<E>;

    /// Copies the bytes into a new owned `String<E>`.
    fn to_owned(&self) -> Self::Owned {
        // SAFETY: the bytes are copied unchanged from a `Str<E>`, which holds data valid for `E`.
        unsafe { String::from_bytes_unchecked(self.1.to_vec()) }
    }
}
impl<E, R> Index<R> for Str<E>
where
    E: Encoding,
    R: RangeBounds<usize> + SliceIndex<[u8], Output = [u8]>,
{
    type Output = Str<E>;

    /// Slices the string by a byte range.
    ///
    /// # Panics
    ///
    /// Panics whenever [`Str::get`] would return `None` for the same range.
    fn index(&self, index: R) -> &Self::Output {
        let Some(sliced) = self.get(index) else {
            panic!("Attempted to slice string at non-character boundary");
        };
        sliced
    }
}
impl<E: Encoding> PartialEq for Str<E> {
    /// Two strings of the same encoding are equal exactly when their bytes are equal.
    fn eq(&self, other: &Str<E>) -> bool {
        let (lhs, rhs) = (self.as_bytes(), other.as_bytes());
        lhs == rhs
    }
}
// Equality is plain byte equality (see `PartialEq`), which is a full equivalence relation.
impl<E: Encoding> Eq for Str<E> {}
impl<E: Encoding> Hash for Str<E> {
    /// Hashes the underlying byte slice, keeping `Hash` consistent with the byte-wise `Eq`.
    fn hash<H: Hasher>(&self, state: &mut H) {
        Hash::hash(self.as_bytes(), state)
    }
}
impl<E: Encoding> AsRef<[u8]> for Str<E> {
    /// Exposes the raw encoded bytes.
    fn as_ref(&self) -> &[u8] {
        &self.1
    }
}
#[cfg(feature = "serde")]
impl<E: Encoding> Serialize for Str<E> {
    /// Serializes the string exactly as its underlying `[u8]` would be serialized.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let raw: &[u8] = self.as_bytes();
        Serialize::serialize(raw, serializer)
    }
}
#[cfg(feature = "serde")]
impl<'de, E: Encoding> Deserialize<'de> for &'de Str<E> {
    /// Deserializes a borrowed byte slice and validates it for the encoding `E`.
    ///
    /// Validation failures are reported as an "invalid value" error carrying the offending
    /// bytes. The expectation message names the encoding only when `alloc` is available to
    /// format it.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let raw = <&'de [u8]>::deserialize(deserializer)?;
        match Str::from_bytes(raw) {
            Ok(parsed) => Ok(parsed),
            Err(_) => {
                #[cfg(feature = "alloc")]
                let msg = &*alloc::format!("a valid string for the {} encoding", E::shorthand());
                #[cfg(not(feature = "alloc"))]
                let msg = "a valid string for this encoding";
                Err(de::Error::invalid_value(Unexpected::Bytes(raw), &msg))
            }
        }
    }
}
impl<'a> From<&'a Str<Utf8>> for &'a str {
fn from(value: &'a Str<Utf8>) -> Self {
value.as_std()
}
}
impl<'a> From<&'a str> for &'a Str<Utf8> {
    /// Borrows a standard library `&str` as a UTF-8 `Str`.
    fn from(value: &'a str) -> Self {
        Str::<Utf8>::from_std(value)
    }
}
impl<'a> From<&'a [char]> for &'a Str<Utf32> {
    /// Borrows a slice of `char`s as a UTF-32 `Str`.
    fn from(value: &'a [char]) -> Self {
        Str::<Utf32>::from_chars(value)
    }
}
// Unit tests for character iteration and recoding of `Str`.
#[cfg(test)]
mod tests {
use super::*;
#[cfg(feature = "alloc")]
use crate::encoding::{Ascii, Win1252};
use alloc::vec::Vec;
// `chars()` must yield the same characters for UTF-8, UTF-16 and UTF-32 sources, including a
// character that needs a surrogate pair (0xD801 0xDC37) in UTF-16.
#[test]
fn test_chars() {
let str = Str::from_std("Abc𐐷d");
assert_eq!(&str.chars().collect::<Vec<_>>(), &['A', 'b', 'c', '𐐷', 'd'],);
let str = Str::<Utf16>::from_utf16(&[
b'A' as u16,
b'b' as u16,
b'c' as u16,
0xD801,
0xDC37,
b'd' as u16,
])
.unwrap();
assert_eq!(&str.chars().collect::<Vec<_>>(), &['A', 'b', 'c', '𐐷', 'd'],);
let str = Str::from_chars(&['A', 'b', 'c', '𐐷', 'd']);
assert_eq!(&str.chars().collect::<Vec<_>>(), &['A', 'b', 'c', '𐐷', 'd'],);
}
// `char_indices()` reports byte offsets, so the step between entries depends on the encoding:
// 1 or 4 bytes in UTF-8, 2 or 4 bytes in UTF-16, always 4 bytes in UTF-32.
#[test]
fn test_char_indices() {
let str = Str::from_std("Abc𐐷d");
assert_eq!(
&str.char_indices().collect::<Vec<_>>(),
&[(0, 'A'), (1, 'b'), (2, 'c'), (3, '𐐷'), (7, 'd')],
);
let str = Str::<Utf16>::from_utf16(&[
b'A' as u16,
b'b' as u16,
b'c' as u16,
0xD801,
0xDC37,
b'd' as u16,
])
.unwrap();
assert_eq!(
&str.char_indices().collect::<Vec<_>>(),
&[(0, 'A'), (2, 'b'), (4, 'c'), (6, '𐐷'), (10, 'd')],
);
let str = Str::from_chars(&['A', 'b', 'c', '𐐷', 'd']);
assert_eq!(
&str.char_indices().collect::<Vec<_>>(),
&[(0, 'A'), (4, 'b'), (8, 'c'), (12, '𐐷'), (16, 'd')],
);
}
// Recoding into an encoding whose output is larger than the input, which exercises the
// buffer-growing path of `recode`.
#[cfg(feature = "alloc")]
#[test]
fn test_recode_small_to_large() {
let a = Str::from_std("Hello World!");
let b = a.recode::<Utf32>().unwrap();
assert_eq!(
&*b,
Str::from_chars(&['H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd', '!']),
);
let a = Str::from_std("A𐐷b");
let b = a.recode::<Utf16>().unwrap();
assert_eq!(
&*b,
Str::from_utf16(&[b'A' as u16, 0xD801, 0xDC37, b'b' as u16]).unwrap()
);
}
// `recode` must fail on a character the target encoding cannot represent, reporting the byte
// offset of that character in the input and its 4-byte UTF-8 length.
#[cfg(feature = "alloc")]
#[test]
fn test_recode_invalid_chars() {
let a = Str::from_std("A𐐷b");
let b = a.recode::<Ascii>();
assert_eq!(
b,
Err(RecodeError {
valid_up_to: 1,
char: '𐐷',
char_len: 4,
})
);
let a = Str::from_std("€𐐷b");
let b = a.recode::<Win1252>();
assert_eq!(
b,
Err(RecodeError {
valid_up_to: 3,
char: '𐐷',
char_len: 4,
})
);
}
// `recode_lossy` must substitute the target's replacement for unrepresentable characters; the
// expected output shows byte 0x1A in that position for both `Ascii` and `Win1252`.
#[cfg(feature = "alloc")]
#[test]
fn test_recode_lossy_invalid_chars() {
let a = Str::from_std("A𐐷b");
let b = a.recode_lossy::<Ascii>();
assert_eq!(&*b, Str::from_bytes(b"A\x1Ab").unwrap());
let a = Str::from_std("€𐐷b");
let b = a.recode_lossy::<Win1252>();
assert_eq!(&*b, Str::from_bytes(b"\x80\x1Ab").unwrap());
}
}