pub struct LegacyCesu8Str<'s> { /* private fields */ }
Expand description
A CESU-8 or Modified UTF-8 string.
The main difference between a CESU-8/MUTF-8 string and a regular UTF-8 string is in the handling of characters that are 4 bytes long in UTF-8. In CESU-8/MUTF-8, each of these characters is instead encoded as a UTF-16 surrogate pair, with each of the two surrogates written as its own three-byte sequence (six bytes in total).
CESU-8 and MUTF-8 strings are encoded the same way, except that MUTF-8 strings, as used by
the JVM and JNI applications, encode a nul byte (hex 00) as the two-byte sequence
hex C0 80 (an overlong UTF-8 encoding of zero).
Implementations§
Source§impl<'s> Cesu8Str<'s>
impl<'s> Cesu8Str<'s>
Sourcepub const fn utf8_error(&self) -> Result<(), Utf8Error>
pub const fn utf8_error(&self) -> Result<(), Utf8Error>
Returns Ok(()) if the string is also valid UTF-8. Otherwise, returns the UTF-8 error that would occur when decoding it as UTF-8, i.e. the error produced by str::from_utf8(cesu8.as_bytes()).
§Examples
- Example 1: A valid UTF8/Ascii string
const VALID_UTF8: &[u8] = b"my valid string";
let as_str = str::from_utf8(VALID_UTF8).map(|_| ());
let as_cesu8 = Cesu8Str::from_cesu8(VALID_UTF8, Variant::Standard).unwrap();
assert_eq!(as_str, as_cesu8.utf8_error());
assert!(as_str.is_ok());
- Example 2: Embedded Nuls are invalid UTF8
const INVALID_UTF8: &[u8] = b"with embedded \xC0\x80 null";
let as_str = str::from_utf8(INVALID_UTF8).map(|_| ());
let as_mutf8 = Cesu8Str::from_cesu8(INVALID_UTF8, Variant::Java).unwrap();
assert_eq!(as_str, as_mutf8.utf8_error());
let utf8_err = as_str.unwrap_err();
assert_eq!(14, utf8_err.valid_up_to());
assert_eq!(Some(1), utf8_err.error_len());
Sourcepub fn into_owned(self) -> Cesu8Str<'static>
pub fn into_owned(self) -> Cesu8Str<'static>
Ensures the string is owned, to alleviate any lifetime issues.
Sourcepub fn from_cesu8(
bytes: &[u8],
variant: Variant,
) -> Result<Cesu8Str<'_>, Cesu8Error>
pub fn from_cesu8( bytes: &[u8], variant: Variant, ) -> Result<Cesu8Str<'_>, Cesu8Error>
Validates a sequence of bytes as CESU-8. This will not allocate.
§Examples
§Valid CESU-8, Valid UTF-8, Valid ascii
use cesu8str::{LegacyCesu8Str as Cesu8Str, Variant};
const ASCII: &[u8] = b"normal ascii string";
let as_cesu8 = Cesu8Str::from_cesu8(ASCII, Variant::Standard).unwrap();
// There were no UTF-8 errors within the string
assert_eq!(from_utf8(ASCII).map(|_| ()), as_cesu8.utf8_error());
assert_eq!(as_cesu8.utf8_error(), Ok(()));
§Valid CESU-8, Invalid UTF-8
use cesu8str::{LegacyCesu8Str as Cesu8Str, Variant};
const VALID_CESU8: &[u8] = b"with embedded \xC0\x80 null";
let as_cesu8 = Cesu8Str::from_cesu8(VALID_CESU8, Variant::Java).unwrap();
// It's not valid UTF-8, check the utf8_error
assert_eq!(from_utf8(VALID_CESU8).map(|_| ()), as_cesu8.utf8_error());
let utf8_err = as_cesu8.utf8_error().unwrap_err();
assert_eq!(14, utf8_err.valid_up_to());
assert_eq!(Some(1), utf8_err.error_len());
§Invalid CESU-8, Invalid UTF-8
use cesu8str::{LegacyCesu8Str as Cesu8Str, Variant};
const INVALID_CESU8: &[u8] = b"with embedded \xC0\x80 null"; // is valid Java variant, but test with Standard so it's invalid
let as_cesu8_err = Cesu8Str::from_cesu8(INVALID_CESU8, Variant::Standard).unwrap_err();
assert_eq!(14, as_cesu8_err.valid_up_to());
assert_eq!(from_utf8(INVALID_CESU8).map(|_| ()), as_cesu8_err.utf8_error());
let valid = &INVALID_CESU8[..as_cesu8_err.valid_up_to()];
let as_cesu8 = Cesu8Str::from_cesu8(valid, Variant::Standard).unwrap();
assert_eq!(from_utf8(valid).map(|_| ()), as_cesu8.utf8_error());
assert_eq!(Ok(()), as_cesu8.utf8_error());
§Invalid CESU-8, Valid UTF-8
use cesu8str::{LegacyCesu8Str as Cesu8Str, Variant};
const VALID_UTF8: &str = "with literal \0 null";
let as_cesu8_err = Cesu8Str::from_cesu8(VALID_UTF8.as_bytes(), Variant::Java).unwrap_err();
assert_eq!(std::str::from_utf8(VALID_UTF8.as_bytes()).map(|_| ()), as_cesu8_err.utf8_error());
Sourcepub fn from_cesu8_lossy(bytes: &[u8], variant: Variant) -> Cesu8Str<'_>
pub fn from_cesu8_lossy(bytes: &[u8], variant: Variant) -> Cesu8Str<'_>
Creates a valid CESU-8 string, replacing invalid sequences with a replacement character.
If the string is already valid, it will not allocate. Otherwise, it will allocate a new buffer.
Note that if an invalid sequence is found at the end of the input (such as an incomplete sequence), it will be replaced, even if appending more bytes to the buffer could have completed it.
Sourcepub unsafe fn from_utf8_unchecked(
bytes: Cow<'_, str>,
variant: Variant,
) -> Cesu8Str<'_>
pub unsafe fn from_utf8_unchecked( bytes: Cow<'_, str>, variant: Variant, ) -> Cesu8Str<'_>
Creates a Cesu8Str from a UTF-8 string.
§Safety
The internal CESU-8 string must not contain invalid CESU-8 sequences.
Namely, there must not be 4-byte UTF-8 supplementary characters, and, if this is the Java variant, there must not be any nul-bytes.
Sourcepub fn from_utf8<C: Into<Cow<'s, str>>>(
text: C,
variant: Variant,
) -> Cesu8Str<'s>
pub fn from_utf8<C: Into<Cow<'s, str>>>( text: C, variant: Variant, ) -> Cesu8Str<'s>
Converts a UTF-8 string into a CESU-8 string, allocating if necessary.
use cesu8str::{LegacyCesu8Str as Cesu8Str, Variant};
// Encode a UTF-8 str (that is also valid CESU-8) into CESU-8 without allocating
let to_encode = "my string (valid CESU8)";
let as_cesu8 = Cesu8Str::from_utf8(to_encode, Variant::Standard);
assert!(matches!(as_cesu8.into_bytes(), Cow::Borrowed(_)));
// Encode a UTF-8 str into Java CESU-8. Will allocate since it has to encode the nul byte.
let to_encode_java = "my string (not valid Java CESU8)\0";
let as_jcesu8 = Cesu8Str::from_utf8(to_encode_java, Variant::Java);
assert!(matches!(as_jcesu8.into_bytes(), Cow::Owned(_)));
// Encode an owned UTF-8 String into CESU-8. Will not allocate since the string is already owned.
let to_encode = "my string (valid CESU8)".to_owned();
let as_cesu8 = Cesu8Str::from_utf8(to_encode, Variant::Standard);
assert!(matches!(as_cesu8.into_bytes(), Cow::Owned(_)));
Sourcepub fn try_from_utf8<C: Into<Cow<'s, str>>>(
text: C,
variant: Variant,
) -> Result<Cesu8Str<'s>, Cesu8Error>
pub fn try_from_utf8<C: Into<Cow<'s, str>>>( text: C, variant: Variant, ) -> Result<Cesu8Str<'s>, Cesu8Error>
Validates a UTF-8 string as a CESU-8 string. Will return an error if it cannot do so without allocating.
See Cesu8Str::from_utf8 for a version that will convert (and allocate if necessary)
Sourcepub fn from_utf8_inplace(
text: &'s str,
buf: &'s mut [u8],
variant: Variant,
) -> Result<Cesu8Str<'s>>
pub fn from_utf8_inplace( text: &'s str, buf: &'s mut [u8], variant: Variant, ) -> Result<Cesu8Str<'s>>
Creates a Cesu8Str by encoding the text into the provided buffer. Alternatively, the result may borrow from the original string if it is already valid CESU-8.
May return an io::Error if there is not enough space in the provided buffer, in which case the buffer’s contents are undefined.
Sourcepub fn from_utf8_writer<W: Write>(
text: &str,
target: &mut W,
variant: Variant,
) -> Result<()>
pub fn from_utf8_writer<W: Write>( text: &str, target: &mut W, variant: Variant, ) -> Result<()>
Converts a UTF-8 string directly into the provided io::Write-capable object. This allows writing directly into a preallocated Vec or byte slice stored on the stack, for example.
Sourcepub fn as_str(&self) -> Result<&str, Utf8Error>
pub fn as_str(&self) -> Result<&str, Utf8Error>
Returns the CESU-8 string as a UTF-8 string without allocating.
Sourcepub fn to_str(&self) -> Cow<'_, str>
pub fn to_str(&self) -> Cow<'_, str>
Returns the CESU-8 string as a UTF-8 string, may allocate.
Sourcepub fn into_str(self) -> Cow<'s, str>
pub fn into_str(self) -> Cow<'s, str>
Returns the CESU-8 string as a UTF-8 string, preserving the allocation if possible.
Sourcepub fn into_bytes(self) -> Cow<'s, [u8]>
pub fn into_bytes(self) -> Cow<'s, [u8]>
Returns the underlying bytes that make up the CESU-8 string.
Sourcepub fn to_variant(&self, variant: Variant) -> Cesu8Str<'_>
pub fn to_variant(&self, variant: Variant) -> Cesu8Str<'_>
Converts between variants
Sourcepub fn into_variant(self, variant: Variant) -> Cesu8Str<'s>
pub fn into_variant(self, variant: Variant) -> Cesu8Str<'s>
Encodes this string into the specified variant. No-op if already encoded in the variant.
Trait Implementations§
Source§impl<'cs, 'us> AddAssign<&'us Cesu8Str<'us>> for Cesu8Str<'cs>
impl<'cs, 'us> AddAssign<&'us Cesu8Str<'us>> for Cesu8Str<'cs>
Source§fn add_assign(&mut self, rhs: &'us Cesu8Str<'us>)
fn add_assign(&mut self, rhs: &'us Cesu8Str<'us>)
Performs the += operation. Read more
Source§impl<'cs, 'us> AddAssign<&'us str> for Cesu8Str<'cs>
impl<'cs, 'us> AddAssign<&'us str> for Cesu8Str<'cs>
Source§fn add_assign(&mut self, text: &'us str)
fn add_assign(&mut self, text: &'us str)
Performs the += operation. Read more