#![no_std]
#![allow(clippy::precedence, clippy::match_overlapping_arm)]
extern crate alloc;
use alloc::borrow::{Cow, ToOwned};
use alloc::boxed::Box;
use alloc::collections::TryReserveError;
use alloc::string::String;
use alloc::vec::Vec;
use core::borrow::Borrow;
use core::fmt;
use core::hash::{Hash, Hasher};
use core::iter::FusedIterator;
use core::mem;
use core::ops;
use core::slice;
use core::str;
use core_char::MAX_LEN_UTF8;
use core_char::{MAX_LEN_UTF16, encode_utf8_raw, encode_utf16_raw, len_utf8};
use core_str::{next_code_point, next_code_point_reverse};
use itertools::{Either, Itertools};
use bstr::{ByteSlice, ByteVec};
mod core_char;
mod core_str;
mod core_str_count;
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
/// A Unicode code point stored as a raw `u32`.
///
/// Unlike `char`, the value may lie in the surrogate range
/// (`0xD800..=0xDFFF`); `to_char` returns `None` for those values.
/// The checked constructor `from_u32` only accepts values up to `0x10FFFF`.
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
pub struct CodePoint {
// Raw scalar value of the code point.
value: u32,
}
impl fmt::Debug for CodePoint {
    /// Writes the code point as `U+` followed by at least four uppercase
    /// hexadecimal digits.
    #[inline]
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter.write_fmt(format_args!("U+{:04X}", self.value))
    }
}
impl fmt::Display for CodePoint {
    /// Displays the code point as a `char`; surrogates are shown as U+FFFD
    /// because the conversion goes through `to_char_lossy`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let shown: char = self.to_char_lossy();
        fmt::Display::fmt(&shown, f)
    }
}
impl CodePoint {
    /// Wraps `value` without performing any range check.
    ///
    /// # Safety
    ///
    /// NOTE(review): presumably `value` must be at most `0x10FFFF`, the bound
    /// enforced by [`CodePoint::from_u32`] — confirm against callers.
    #[inline]
    pub const unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
        Self { value }
    }

    /// Returns a code point for `value`, or `None` when it exceeds `0x10FFFF`.
    #[inline]
    pub const fn from_u32(value: u32) -> Option<CodePoint> {
        if value <= 0x10FFFF {
            Some(CodePoint { value })
        } else {
            None
        }
    }

    /// Converts a `char` into the code point with the same numeric value.
    #[inline]
    pub const fn from_char(value: char) -> CodePoint {
        let value = value as u32;
        CodePoint { value }
    }

    /// Returns the numeric value of this code point.
    #[inline]
    pub const fn to_u32(self) -> u32 {
        let CodePoint { value } = self;
        value
    }

    /// Returns the value as a lead surrogate if it lies in `0xD800..=0xDBFF`.
    #[inline]
    pub const fn to_lead_surrogate(self) -> Option<LeadSurrogate> {
        let raw = self.value;
        if matches!(raw, 0xD800..=0xDBFF) {
            Some(LeadSurrogate(raw as u16))
        } else {
            None
        }
    }

    /// Returns the value as a trail surrogate if it lies in `0xDC00..=0xDFFF`.
    #[inline]
    pub const fn to_trail_surrogate(self) -> Option<TrailSurrogate> {
        let raw = self.value;
        if matches!(raw, 0xDC00..=0xDFFF) {
            Some(TrailSurrogate(raw as u16))
        } else {
            None
        }
    }

    /// Returns the code point as a `char`, or `None` if it is a surrogate.
    #[inline]
    pub const fn to_char(self) -> Option<char> {
        if matches!(self.value, 0xD800..=0xDFFF) {
            return None;
        }
        // SAFETY: surrogates were rejected above; the remaining values are
        // assumed to be within the code point range upheld by the constructors.
        Some(unsafe { char::from_u32_unchecked(self.value) })
    }

    /// Like [`CodePoint::to_char`], but substitutes U+FFFD for surrogates.
    #[inline]
    pub fn to_char_lossy(self) -> char {
        match self.to_char() {
            Some(ch) => ch,
            None => '\u{FFFD}',
        }
    }

    /// Returns `f(c)` when this code point is a `char` `c`, otherwise `false`.
    pub fn is_char_and(self, f: impl FnOnce(char) -> bool) -> bool {
        match self.to_char() {
            Some(ch) => f(ch),
            None => false,
        }
    }

    /// Encodes the code point into `dst` and returns the written prefix as
    /// WTF-8.
    pub fn encode_wtf8(self, dst: &mut [u8]) -> &mut Wtf8 {
        let encoded = encode_utf8_raw(self.value, dst);
        // SAFETY: `encoded` holds exactly the bytes just produced for one code point.
        unsafe { Wtf8::from_mut_bytes_unchecked(encoded) }
    }

    /// Returns the length reported by `len_utf8` for this code point's value.
    pub const fn len_wtf8(&self) -> usize {
        let raw = self.value;
        len_utf8(raw)
    }

    /// Returns `true` when the code point is an ASCII `char`.
    pub fn is_ascii(&self) -> bool {
        self.is_char_and(|ch| char::is_ascii(&ch))
    }
}
impl From<u16> for CodePoint {
    /// Widens a 16-bit value (surrogates included) into a code point.
    fn from(value: u16) -> Self {
        let widened = u32::from(value);
        // SAFETY: every `u16` is below `0x10FFFF`.
        unsafe { Self::from_u32_unchecked(widened) }
    }
}
impl From<u8> for CodePoint {
    /// Converts a byte to the code point with the same numeric value.
    fn from(value: u8) -> Self {
        let ch = char::from(value);
        Self::from_char(ch)
    }
}
impl From<char> for CodePoint {
fn from(value: char) -> Self {
Self::from_char(value)
}
}
impl From<ascii::AsciiChar> for CodePoint {
    /// Converts an ASCII character by first turning it into a `char`.
    fn from(value: ascii::AsciiChar) -> Self {
        let ch: char = value.into();
        Self::from_char(ch)
    }
}
impl From<CodePoint> for Wtf8Buf {
    /// Builds an owned string containing exactly the encoding of `ch`.
    fn from(ch: CodePoint) -> Self {
        let mut scratch = [0; MAX_LEN_UTF8];
        let encoded = ch.encode_wtf8(&mut scratch);
        encoded.to_owned()
    }
}
impl PartialEq<char> for CodePoint {
    /// Compares by numeric value.
    fn eq(&self, other: &char) -> bool {
        let rhs = *other as u32;
        self.value == rhs
    }
}
impl PartialEq<CodePoint> for char {
    /// Compares by numeric value.
    fn eq(&self, other: &CodePoint) -> bool {
        let lhs = *self as u32;
        lhs == other.value
    }
}
/// A 16-bit lead surrogate; `CodePoint::to_lead_surrogate` produces it for
/// values in `0xD800..=0xDBFF`.
#[derive(Clone, Copy)]
pub struct LeadSurrogate(u16);
/// A 16-bit trail surrogate; `CodePoint::to_trail_surrogate` produces it for
/// values in `0xDC00..=0xDFFF`.
#[derive(Clone, Copy)]
pub struct TrailSurrogate(u16);
impl LeadSurrogate {
pub const fn merge(self, trail: TrailSurrogate) -> char {
decode_surrogate_pair(self.0, trail.0)
}
}
/// An owned, growable WTF-8 string backed by a `Vec<u8>`.
///
/// Dereferences to [`Wtf8`] for all borrowed operations.
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Default)]
pub struct Wtf8Buf {
// Encoded contents; the unsafe conversions in this file assume they are well-formed WTF-8.
bytes: Vec<u8>,
}
impl ops::Deref for Wtf8Buf {
type Target = Wtf8;
fn deref(&self) -> &Wtf8 {
self.as_slice()
}
}
impl ops::DerefMut for Wtf8Buf {
fn deref_mut(&mut self) -> &mut Wtf8 {
self.as_mut_slice()
}
}
impl Borrow<Wtf8> for Wtf8Buf {
    /// Borrows the buffer as a `Wtf8` slice.
    fn borrow(&self) -> &Wtf8 {
        self.as_slice()
    }
}
impl fmt::Debug for Wtf8Buf {
#[inline]
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(&**self, formatter)
}
}
impl fmt::Display for Wtf8Buf {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(&**self, formatter)
}
}
impl Wtf8Buf {
#[inline]
pub fn new() -> Wtf8Buf {
Wtf8Buf::default()
}
/// Creates an empty buffer whose byte vector is allocated with `capacity`.
#[inline]
pub fn with_capacity(capacity: usize) -> Wtf8Buf {
    let bytes = Vec::with_capacity(capacity);
    Wtf8Buf { bytes }
}
#[inline]
pub const unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
Wtf8Buf { bytes: value }
}
/// Takes ownership of `value` if `Wtf8::from_bytes` accepts it; otherwise
/// hands the vector back unchanged in `Err`.
pub fn from_bytes(value: Vec<u8>) -> Result<Self, Vec<u8>> {
    if Wtf8::from_bytes(&value).is_none() {
        return Err(value);
    }
    // SAFETY: the bytes were validated just above.
    Ok(unsafe { Self::from_bytes_unchecked(value) })
}
/// Converts a `String` by reusing its byte buffer.
#[inline]
pub fn from_string(string: String) -> Wtf8Buf {
    let bytes = string.into_bytes();
    Wtf8Buf { bytes }
}
/// Concatenates the items of `iter`, inserting `sep` between consecutive
/// items. An empty iterator yields an empty buffer.
pub fn join<I, S>(sep: impl AsRef<Wtf8>, iter: I) -> Wtf8Buf
where
    I: IntoIterator<Item = S>,
    S: AsRef<Wtf8>,
{
    let sep = sep.as_ref();
    let mut parts = iter.into_iter();
    let Some(first) = parts.next() else {
        return Wtf8Buf::new();
    };
    let head: &Wtf8 = first.as_ref();
    let mut joined = head.to_owned();
    parts.for_each(|part| {
        joined.push_wtf8(sep);
        joined.push_wtf8(part.as_ref());
    });
    joined
}
/// Removes all contents; the underlying vector is only cleared, not
/// reallocated here.
pub fn clear(&mut self) {
    Vec::clear(&mut self.bytes);
}
/// Decodes potentially ill-formed UTF-16 into WTF-8.
///
/// Valid pairs become supplementary characters; unpaired surrogates are
/// kept as surrogate code points.
pub fn from_wide(v: &[u16]) -> Wtf8Buf {
    let mut out = Wtf8Buf::with_capacity(v.len());
    for decoded in char::decode_utf16(v.iter().copied()) {
        let code_point = match decoded {
            Ok(ch) => CodePoint::from_char(ch),
            Err(unpaired) => CodePoint::from(unpaired.unpaired_surrogate()),
        };
        out.push(code_point);
    }
    out
}
/// Borrows the contents as a `Wtf8` slice.
#[inline]
pub fn as_slice(&self) -> &Wtf8 {
    let raw: &[u8] = &self.bytes;
    // SAFETY: relies on the buffer's contents being well-formed WTF-8.
    unsafe { Wtf8::from_bytes_unchecked(raw) }
}
/// Mutably borrows the contents as a `Wtf8` slice.
#[inline]
pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
    let raw: &mut [u8] = &mut self.bytes;
    // SAFETY: relies on the buffer's contents being well-formed WTF-8.
    unsafe { Wtf8::from_mut_bytes_unchecked(raw) }
}
/// Forwards to `Vec::reserve` on the underlying bytes.
#[inline]
pub fn reserve(&mut self, additional: usize) {
    Vec::reserve(&mut self.bytes, additional)
}
/// Forwards to `Vec::try_reserve` on the underlying bytes.
#[inline]
pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
    Vec::try_reserve(&mut self.bytes, additional)
}
/// Forwards to `Vec::reserve_exact` on the underlying bytes.
#[inline]
pub fn reserve_exact(&mut self, additional: usize) {
    Vec::reserve_exact(&mut self.bytes, additional)
}
/// Forwards to `Vec::try_reserve_exact` on the underlying bytes.
#[inline]
pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
    Vec::try_reserve_exact(&mut self.bytes, additional)
}
/// Forwards to `Vec::shrink_to_fit` on the underlying bytes.
#[inline]
pub fn shrink_to_fit(&mut self) {
    Vec::shrink_to_fit(&mut self.bytes)
}
/// Forwards to `Vec::shrink_to` on the underlying bytes.
#[inline]
pub fn shrink_to(&mut self, min_capacity: usize) {
    Vec::shrink_to(&mut self.bytes, min_capacity)
}
/// Consumes the buffer and leaks its bytes, returning a mutable `Wtf8`
/// reference with a caller-chosen lifetime.
#[inline]
pub fn leak<'a>(self) -> &'a mut Wtf8 {
    let leaked: &'a mut [u8] = self.bytes.leak();
    // SAFETY: the leaked bytes are the buffer's own contents.
    unsafe { Wtf8::from_mut_bytes_unchecked(leaked) }
}
/// Returns the capacity, in bytes, of the underlying vector.
#[inline]
pub const fn capacity(&self) -> usize {
    Vec::capacity(&self.bytes)
}
/// Appends the bytes of a UTF-8 string.
#[inline]
pub fn push_str(&mut self, other: &str) {
    let utf8 = other.as_bytes();
    self.bytes.extend_from_slice(utf8)
}
/// Appends the bytes of `other` verbatim (no surrogate merging is done here).
#[inline]
pub fn push_wtf8(&mut self, other: &Wtf8) {
    let extra = other.as_bytes();
    self.bytes.extend_from_slice(extra);
}
/// Appends a single `char`.
#[inline]
pub fn push_char(&mut self, c: char) {
    let code_point = CodePoint::from_char(c);
    self.push(code_point)
}
/// Appends a single code point, encoding it through a stack buffer.
#[inline]
pub fn push(&mut self, code_point: CodePoint) {
    let mut scratch = [0; MAX_LEN_UTF8];
    let encoded = code_point.encode_wtf8(&mut scratch);
    self.push_wtf8(encoded)
}
/// Removes and returns the last code point, or `None` when empty.
pub fn pop(&mut self) -> Option<CodePoint> {
    let last = self.code_points().next_back()?;
    // Drop exactly the bytes that encoded the removed code point.
    let kept = self.len() - last.len_wtf8();
    self.bytes.truncate(kept);
    Some(last)
}
/// Shortens the buffer to `new_len` bytes.
///
/// # Panics
///
/// Panics if `new_len` is not a code point boundary of the current contents.
#[inline]
pub fn truncate(&mut self, new_len: usize) {
    assert!(is_code_point_boundary(self, new_len));
    Vec::truncate(&mut self.bytes, new_len)
}
/// Inserts the encoding of `c` at byte offset `idx`; see `insert_wtf8` for
/// the boundary requirement.
#[inline]
pub fn insert(&mut self, idx: usize, c: CodePoint) {
    let mut scratch = [0; MAX_LEN_UTF8];
    let encoded = c.encode_wtf8(&mut scratch);
    self.insert_wtf8(idx, encoded)
}
#[inline]
pub fn insert_wtf8(&mut self, idx: usize, w: &Wtf8) {
assert!(is_code_point_boundary(self, idx));
self.bytes.insert_str(idx, w)
}
#[inline]
pub fn into_bytes(self) -> Vec<u8> {
self.bytes
}
/// Converts into a `String` when the contents contain no surrogate;
/// otherwise returns the buffer unchanged in `Err`.
pub fn into_string(self) -> Result<String, Wtf8Buf> {
    if !self.is_utf8() {
        return Err(self);
    }
    // SAFETY: `is_utf8` found no surrogate in the contents.
    Ok(unsafe { String::from_utf8_unchecked(self.bytes) })
}
/// Converts into a `String`, overwriting each surrogate in place with
/// U+FFFD (both encodings occupy three bytes).
pub fn into_string_lossy(mut self) -> String {
    let replacement = UTF8_REPLACEMENT_CHARACTER.as_bytes();
    let mut cursor = 0;
    while let Some((at, _)) = self.next_surrogate(cursor) {
        cursor = at + 3;
        self.bytes[at..cursor].copy_from_slice(replacement);
    }
    // SAFETY: every surrogate found by the scan has been replaced.
    unsafe { String::from_utf8_unchecked(self.bytes) }
}
/// Converts the buffer into a boxed `Wtf8`.
#[inline]
pub fn into_box(self) -> Box<Wtf8> {
    let boxed: Box<[u8]> = self.bytes.into_boxed_slice();
    // SAFETY: `Wtf8` wraps a single `[u8]` field; the cast-based helpers in
    // this file rely on the same layout assumption.
    unsafe { mem::transmute(boxed) }
}
/// Converts a boxed `Wtf8` back into a growable buffer.
pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
    // SAFETY: `Wtf8` wraps a single `[u8]` field; the cast-based helpers in
    // this file rely on the same layout assumption.
    let raw: Box<[u8]> = unsafe { mem::transmute(boxed) };
    let bytes = raw.into_vec();
    Wtf8Buf { bytes }
}
}
impl FromIterator<CodePoint> for Wtf8Buf {
    /// Collects code points into a new buffer.
    fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
        let mut collected = Wtf8Buf::new();
        Extend::<CodePoint>::extend(&mut collected, iter);
        collected
    }
}
impl Extend<CodePoint> for Wtf8Buf {
    /// Appends every code point, reserving the iterator's lower size bound
    /// (in bytes) up front.
    fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
        let code_points = iter.into_iter();
        let (lower_bound, _) = code_points.size_hint();
        self.bytes.reserve(lower_bound);
        for code_point in code_points {
            self.push(code_point);
        }
    }
}
impl Extend<char> for Wtf8Buf {
fn extend<T: IntoIterator<Item = char>>(&mut self, iter: T) {
self.extend(iter.into_iter().map(CodePoint::from))
}
}
impl<W: AsRef<Wtf8>> Extend<W> for Wtf8Buf {
    /// Appends every string-like item in order.
    fn extend<T: IntoIterator<Item = W>>(&mut self, iter: T) {
        for piece in iter {
            self.push_wtf8(piece.as_ref());
        }
    }
}
impl<W: AsRef<Wtf8>> FromIterator<W> for Wtf8Buf {
    /// Concatenates every string-like item into a new buffer.
    fn from_iter<T: IntoIterator<Item = W>>(iter: T) -> Self {
        let mut collected = Wtf8Buf::new();
        for piece in iter {
            collected.push_wtf8(piece.as_ref());
        }
        collected
    }
}
impl Hash for Wtf8Buf {
fn hash<H: Hasher>(&self, state: &mut H) {
Wtf8::hash(self, state)
}
}
impl AsRef<Wtf8> for Wtf8Buf {
    /// Borrows the buffer as a `Wtf8` slice.
    fn as_ref(&self) -> &Wtf8 {
        self.as_slice()
    }
}
impl From<String> for Wtf8Buf {
fn from(s: String) -> Self {
Wtf8Buf::from_string(s)
}
}
impl From<&str> for Wtf8Buf {
    /// Copies the string slice into a new buffer.
    fn from(s: &str) -> Self {
        let owned = String::from(s);
        Wtf8Buf::from_string(owned)
    }
}
impl From<ascii::AsciiString> for Wtf8Buf {
    /// Converts an ASCII string by first turning it into a `String`.
    fn from(s: ascii::AsciiString) -> Self {
        let owned: String = s.into();
        Wtf8Buf::from_string(owned)
    }
}
/// A borrowed slice of WTF-8 data; the unsized counterpart of [`Wtf8Buf`].
///
/// `#[repr(transparent)]` guarantees that `Wtf8` has the same layout as its
/// only field, `[u8]`. The pointer casts between `[u8]` and `Wtf8` and the
/// `Box<[u8]>`/`Box<Wtf8>` transmutes elsewhere in this file rely on that
/// guarantee, which the default representation does not provide.
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct Wtf8 {
    bytes: [u8],
}
// Identity conversion, so `Wtf8` itself satisfies `AsRef<Wtf8>` bounds.
impl AsRef<Wtf8> for Wtf8 {
fn as_ref(&self) -> &Wtf8 {
self
}
}
impl ToOwned for Wtf8 {
type Owned = Wtf8Buf;
fn to_owned(&self) -> Self::Owned {
self.to_wtf8_buf()
}
fn clone_into(&self, buf: &mut Self::Owned) {
self.bytes.clone_into(&mut buf.bytes);
}
}
impl PartialEq<str> for Wtf8 {
    /// Compares the raw bytes of both strings.
    fn eq(&self, other: &str) -> bool {
        let (lhs, rhs) = (self.as_bytes(), other.as_bytes());
        lhs == rhs
    }
}
impl fmt::Debug for Wtf8 {
    /// Writes the string in double quotes: UTF-8 runs are escaped with
    /// `char::escape_debug`, and each surrogate is written as `\u{..}` with
    /// lowercase hexadecimal digits.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        /// Writes `s` with every character debug-escaped.
        fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
            use core::fmt::Write;
            s.chars()
                .flat_map(char::escape_debug)
                .try_for_each(|escaped| f.write_char(escaped))
        }

        formatter.write_str("\"")?;
        let mut cursor = 0;
        loop {
            let Some((at, surrogate)) = self.next_surrogate(cursor) else {
                break;
            };
            // SAFETY: the bytes between two surrogates contain no surrogate,
            // so they are plain UTF-8.
            let utf8 = unsafe { str::from_utf8_unchecked(&self.bytes[cursor..at]) };
            write_str_escaped(formatter, utf8)?;
            write!(formatter, "\\u{{{surrogate:x}}}")?;
            cursor = at + 3;
        }
        // SAFETY: no surrogate remains after `cursor`.
        let tail = unsafe { str::from_utf8_unchecked(&self.bytes[cursor..]) };
        write_str_escaped(formatter, tail)?;
        formatter.write_str("\"")
    }
}
impl fmt::Display for Wtf8 {
    /// Writes the string with every surrogate replaced by U+FFFD.
    ///
    /// When there is no surrogate at all, the whole string is formatted via
    /// `str`'s `Display`; otherwise the pieces are written with `write_str`.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        let bytes = &self.bytes;
        let mut cursor = 0;
        while let Some((at, _)) = self.next_surrogate(cursor) {
            // SAFETY: the bytes before the surrogate contain no surrogate.
            let utf8 = unsafe { str::from_utf8_unchecked(&bytes[cursor..at]) };
            formatter.write_str(utf8)?;
            formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
            cursor = at + 3;
        }
        // SAFETY: no surrogate remains after `cursor`.
        let tail = unsafe { str::from_utf8_unchecked(&bytes[cursor..]) };
        if cursor == 0 {
            fmt::Display::fmt(tail, formatter)
        } else {
            formatter.write_str(tail)
        }
    }
}
impl Default for &Wtf8 {
    /// Returns a reference to an empty `Wtf8`.
    fn default() -> Self {
        let empty: &[u8] = &[];
        // SAFETY: an empty byte slice contains no ill-formed sequence.
        unsafe { Wtf8::from_bytes_unchecked(empty) }
    }
}
impl Hash for Wtf8 {
    /// Feeds the raw bytes followed by a single `0xff` byte to the hasher.
    fn hash<H: Hasher>(&self, state: &mut H) {
        let raw: &[u8] = &self.bytes;
        state.write(raw);
        state.write_u8(0xff);
    }
}
impl Wtf8 {
/// Borrows any `AsRef<Wtf8>` value as a `Wtf8` slice.
#[inline]
pub fn new<S: AsRef<Wtf8> + ?Sized>(value: &S) -> &Wtf8 {
    <S as AsRef<Wtf8>>::as_ref(value)
}
/// Reinterprets a byte slice as `Wtf8` without validation.
///
/// # Safety
///
/// NOTE(review): presumably `value` must be well-formed WTF-8, as the checked
/// `from_bytes` verifies — confirm.
#[inline]
pub const unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
    let ptr = value as *const [u8] as *const Wtf8;
    // SAFETY: the pointer comes from a live shared reference; the cast only
    // changes the nominal type of the slice.
    unsafe { &*ptr }
}
/// Reinterprets a mutable byte slice as `Wtf8` without validation.
///
/// # Safety
///
/// NOTE(review): presumably `value` must be well-formed WTF-8 — confirm.
#[inline]
const unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
    let ptr = value as *mut [u8] as *mut Wtf8;
    // SAFETY: the pointer comes from a live exclusive reference; the cast
    // only changes the nominal type of the slice.
    unsafe { &mut *ptr }
}
/// Validates `b` and borrows it as `Wtf8`.
///
/// The input is checked as UTF-8; at every UTF-8 error the next three bytes
/// must be accepted by `Self::decode_surrogate`, after which checking
/// resumes. Returns `None` if any error position is not such a sequence.
#[inline]
pub fn from_bytes(b: &[u8]) -> Option<&Self> {
    let mut rest = b;
    loop {
        match core::str::from_utf8(rest) {
            Ok(_) => break,
            Err(err) => {
                let tail = &rest[err.valid_up_to()..];
                Self::decode_surrogate(tail)?;
                // Skip the three-byte surrogate that was just accepted.
                rest = &tail[3..];
            }
        }
    }
    // SAFETY: every part of `b` was validated by the loop above.
    Some(unsafe { Wtf8::from_bytes_unchecked(b) })
}
/// Decodes the surrogate code point encoded at the start of `b`.
///
/// A surrogate in WTF-8 is exactly `0xED`, then a byte in `0xA0..=0xBF`,
/// then a continuation byte in `0x80..=0xBF`. Returns `None` when `b` is
/// shorter than three bytes or does not start with such a sequence.
///
/// Both trailing bytes are range-checked: an open-ended second byte or an
/// unchecked third byte would let the safe `from_bytes` constructor accept
/// malformed input such as `[0xED, 0xA0, 0x41]`.
fn decode_surrogate(b: &[u8]) -> Option<CodePoint> {
    let [0xED, b2 @ 0xA0..=0xBF, b3 @ 0x80..=0xBF, ..] = *b else {
        return None;
    };
    Some(decode_surrogate(b2, b3).into())
}
/// Returns the length in bytes.
#[inline]
pub const fn len(&self) -> usize {
    self.as_bytes().len()
}
/// Returns `true` when the string contains no bytes.
#[inline]
pub const fn is_empty(&self) -> bool {
    self.as_bytes().is_empty()
}
/// Returns the byte at `position` if it is ASCII, or `0xFF` otherwise.
///
/// # Panics
///
/// Panics if `position` is out of bounds.
#[inline]
pub const fn ascii_byte_at(&self, position: usize) -> u8 {
    let byte = self.bytes[position];
    if byte <= 0x7F { byte } else { 0xFF }
}
/// Returns an iterator over the code points of the string.
#[inline]
pub fn code_points(&self) -> Wtf8CodePoints<'_> {
    let bytes = self.bytes.iter();
    Wtf8CodePoints { bytes }
}
/// Returns an iterator over `(byte offset, code point)` pairs.
#[inline]
pub fn code_point_indices(&self) -> Wtf8CodePointIndices<'_> {
    let iter = self.code_points();
    Wtf8CodePointIndices {
        front_offset: 0,
        iter,
    }
}
#[inline]
pub const fn as_bytes(&self) -> &[u8] {
&self.bytes
}
/// Tries to view the string as UTF-8, returning the `Utf8Error` from
/// `str::from_utf8` on failure.
#[inline]
pub const fn as_str(&self) -> Result<&str, str::Utf8Error> {
    str::from_utf8(self.as_bytes())
}
/// Copies the slice into a new owned buffer.
pub fn to_wtf8_buf(&self) -> Wtf8Buf {
    let bytes = Vec::from(&self.bytes);
    Wtf8Buf { bytes }
}
/// Converts to UTF-8, replacing each surrogate with U+FFFD.
///
/// Borrows when the string contains no surrogate; otherwise allocates.
pub fn to_string_lossy(&self) -> Cow<'_, str> {
    let Some((first, _)) = self.next_surrogate(0) else {
        // SAFETY: no surrogate was found, so the bytes are plain UTF-8.
        return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) });
    };
    let source = &self.bytes;
    let replacement = UTF8_REPLACEMENT_CHARACTER.as_bytes();
    let mut utf8 = Vec::with_capacity(self.len());
    let mut copied = 0;
    let mut pending = Some(first);
    while let Some(at) = pending {
        utf8.extend_from_slice(&source[copied..at]);
        utf8.extend_from_slice(replacement);
        // A surrogate occupies three bytes.
        copied = at + 3;
        pending = self.next_surrogate(copied).map(|(next, _)| next);
    }
    utf8.extend_from_slice(&source[copied..]);
    // SAFETY: every surrogate was replaced by a UTF-8 sequence.
    Cow::Owned(unsafe { String::from_utf8_unchecked(utf8) })
}
/// Returns an iterator that re-encodes the string as 16-bit code units.
#[inline]
pub fn encode_wide(&self) -> EncodeWide<'_> {
    let code_points = self.code_points();
    EncodeWide {
        code_points,
        extra: 0,
    }
}
/// Returns an iterator over alternating UTF-8 runs and surrogates.
pub const fn chunks(&self) -> Wtf8Chunks<'_> {
    let wtf8 = self;
    Wtf8Chunks { wtf8 }
}
/// Applies `f` to every UTF-8 run and yields the resulting characters as
/// code points; surrogates between the runs are passed through unchanged.
pub fn map_utf8<'a, I>(&'a self, f: impl Fn(&'a str) -> I) -> impl Iterator<Item = CodePoint>
where
    I: Iterator<Item = char>,
{
    self.chunks().flat_map(move |piece| match piece {
        Wtf8Chunk::Surrogate(code_point) => Either::Right(core::iter::once(code_point)),
        Wtf8Chunk::Utf8(text) => Either::Left(f(text).map_into()),
    })
}
// Finds the first surrogate at or after byte offset `pos`.
//
// Returns the surrogate's byte offset together with its decoded 16-bit
// value, or `None` when the end of the bytes is reached first. The walk
// advances by whole sequences based on the leading byte, so `pos` is
// expected to be the start of a sequence.
#[inline]
fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
let mut iter = self.bytes[pos..].iter();
loop {
let b = *iter.next()?;
if b < 0x80 {
// Single-byte sequence.
pos += 1;
} else if b < 0xE0 {
// Two-byte sequence: skip the one trailing byte.
iter.next();
pos += 2;
} else if b == 0xED {
// Three-byte sequence starting with 0xED: it is a surrogate exactly
// when the second byte is at least 0xA0.
match (iter.next(), iter.next()) {
(Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
return Some((pos, decode_surrogate(b2, b3)));
}
_ => pos += 3,
}
} else if b < 0xF0 {
// Any other three-byte sequence.
iter.next();
iter.next();
pos += 3;
} else {
// Four-byte sequence.
iter.next();
iter.next();
iter.next();
pos += 4;
}
}
}
pub fn is_code_point_boundary(&self, index: usize) -> bool {
is_code_point_boundary(self, index)
}
/// Copies the slice into a new boxed `Wtf8`.
#[inline]
pub fn into_box(&self) -> Box<Wtf8> {
    let copied = Box::<[u8]>::from(&self.bytes);
    // SAFETY: `Wtf8` wraps a single `[u8]` field; the cast-based helpers in
    // this file rely on the same layout assumption.
    unsafe { mem::transmute(copied) }
}
/// Creates a boxed, empty `Wtf8`.
pub fn empty_box() -> Box<Wtf8> {
    let empty = Box::<[u8]>::default();
    // SAFETY: `Wtf8` wraps a single `[u8]` field; the cast-based helpers in
    // this file rely on the same layout assumption.
    unsafe { mem::transmute(empty) }
}
/// Lowercases ASCII letters in place; other bytes are left untouched.
#[inline]
pub fn make_ascii_lowercase(&mut self) {
    <[u8]>::make_ascii_lowercase(&mut self.bytes)
}
/// Uppercases ASCII letters in place; other bytes are left untouched.
#[inline]
pub fn make_ascii_uppercase(&mut self) {
    <[u8]>::make_ascii_uppercase(&mut self.bytes)
}
/// Returns a copy with ASCII letters lowercased.
#[inline]
pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
    let bytes = self.bytes.to_ascii_lowercase();
    Wtf8Buf { bytes }
}
/// Returns a copy with ASCII letters uppercased.
#[inline]
pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
    let bytes = self.bytes.to_ascii_uppercase();
    Wtf8Buf { bytes }
}
/// Returns a copy in which each UTF-8 run is lowercased with
/// `str::to_lowercase`; surrogates are copied unchanged.
pub fn to_lowercase(&self) -> Wtf8Buf {
    let mut lowered = Wtf8Buf::with_capacity(self.len());
    self.chunks().for_each(|piece| match piece {
        Wtf8Chunk::Utf8(text) => lowered.push_str(&text.to_lowercase()),
        Wtf8Chunk::Surrogate(code_point) => lowered.push(code_point),
    });
    lowered
}
/// Returns a copy in which each UTF-8 run is uppercased with
/// `str::to_uppercase`; surrogates are copied unchanged.
pub fn to_uppercase(&self) -> Wtf8Buf {
    let mut raised = Wtf8Buf::with_capacity(self.len());
    self.chunks().for_each(|piece| match piece {
        Wtf8Chunk::Utf8(text) => raised.push_str(&text.to_uppercase()),
        Wtf8Chunk::Surrogate(code_point) => raised.push(code_point),
    });
    raised
}
/// Returns `true` when every byte is ASCII.
#[inline]
pub const fn is_ascii(&self) -> bool {
    self.as_bytes().is_ascii()
}
/// Returns `true` when the string contains no surrogate.
#[inline]
pub fn is_utf8(&self) -> bool {
    let first_surrogate = self.next_surrogate(0);
    first_surrogate.is_none()
}
/// Compares two strings byte-wise, ignoring ASCII case.
#[inline]
pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
    let (lhs, rhs) = (&self.bytes, &other.bytes);
    lhs.eq_ignore_ascii_case(rhs)
}
/// Splits the string at every occurrence of `pat`, front to back.
///
/// NOTE(review): the pieces come from a byte-level split and are not
/// re-validated; an empty `pat` presumably makes bstr split between
/// individual bytes — confirm callers never pass one.
pub fn split(&self, pat: &Wtf8) -> impl Iterator<Item = &Self> {
    let haystack = self.as_bytes();
    haystack
        .split_str(pat)
        // SAFETY: see the note above about pattern-aligned pieces.
        .map(|piece| unsafe { Wtf8::from_bytes_unchecked(piece) })
}
/// Like `split`, but passes the limit `n` through to the byte-level
/// `splitn_str`.
pub fn splitn(&self, n: usize, pat: &Wtf8) -> impl Iterator<Item = &Self> {
    let haystack = self.as_bytes();
    haystack
        .splitn_str(n, pat)
        // SAFETY: pieces are delimited by occurrences of a WTF-8 pattern.
        .map(|piece| unsafe { Wtf8::from_bytes_unchecked(piece) })
}
/// Splits the string at every occurrence of `pat`, back to front.
pub fn rsplit(&self, pat: &Wtf8) -> impl Iterator<Item = &Self> {
    let haystack = self.as_bytes();
    haystack
        .rsplit_str(pat)
        // SAFETY: pieces are delimited by occurrences of a WTF-8 pattern.
        .map(|piece| unsafe { Wtf8::from_bytes_unchecked(piece) })
}
/// Like `rsplit`, but passes the limit `n` through to the byte-level
/// `rsplitn_str`.
pub fn rsplitn(&self, n: usize, pat: &Wtf8) -> impl Iterator<Item = &Self> {
    let haystack = self.as_bytes();
    haystack
        .rsplitn_str(n, pat)
        // SAFETY: pieces are delimited by occurrences of a WTF-8 pattern.
        .map(|piece| unsafe { Wtf8::from_bytes_unchecked(piece) })
}
pub fn trim(&self) -> &Self {
let w = self.bytes.trim();
unsafe { Wtf8::from_bytes_unchecked(w) }
}
pub fn trim_start(&self) -> &Self {
let w = self.bytes.trim_start();
unsafe { Wtf8::from_bytes_unchecked(w) }
}
pub fn trim_end(&self) -> &Self {
let w = self.bytes.trim_end();
unsafe { Wtf8::from_bytes_unchecked(w) }
}
/// Removes leading code points for as long as `f` returns `true`.
pub fn trim_start_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
    let mut remaining = self.code_points();
    loop {
        // Remember the position before consuming, so a rejected code point
        // stays part of the result.
        let checkpoint = remaining.clone();
        match remaining.next() {
            Some(code_point) if f(code_point) => {}
            Some(_) => return checkpoint.as_wtf8(),
            None => return remaining.as_wtf8(),
        }
    }
}
/// Removes trailing code points for as long as `f` returns `true`.
pub fn trim_end_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
    let mut remaining = self.code_points();
    loop {
        // Remember the position before consuming, so a rejected code point
        // stays part of the result.
        let checkpoint = remaining.clone();
        match remaining.next_back() {
            Some(code_point) if f(code_point) => {}
            Some(_) => return checkpoint.as_wtf8(),
            None => return remaining.as_wtf8(),
        }
    }
}
/// Removes matching code points from the start, then from the end.
pub fn trim_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
    let without_prefix = self.trim_start_matches(&f);
    without_prefix.trim_end_matches(&f)
}
/// Returns the byte offset of the first occurrence of `pat`, if any.
pub fn find(&self, pat: &Wtf8) -> Option<usize> {
    let (haystack, needle) = (self.as_bytes(), pat.as_bytes());
    memchr::memmem::find(haystack, needle)
}
/// Returns the byte offset of the last occurrence of `pat`, if any.
pub fn rfind(&self, pat: &Wtf8) -> Option<usize> {
    let (haystack, needle) = (self.as_bytes(), pat.as_bytes());
    memchr::memmem::rfind(haystack, needle)
}
/// Iterates over the byte offsets of occurrences of `pat`, front to back.
pub fn find_iter(&self, pat: &Wtf8) -> impl Iterator<Item = usize> {
    let (haystack, needle) = (self.as_bytes(), pat.as_bytes());
    memchr::memmem::find_iter(haystack, needle)
}
/// Iterates over the byte offsets of occurrences of `pat`, back to front.
pub fn rfind_iter(&self, pat: &Wtf8) -> impl Iterator<Item = usize> {
    let (haystack, needle) = (self.as_bytes(), pat.as_bytes());
    memchr::memmem::rfind_iter(haystack, needle)
}
/// Returns `true` when `pat` occurs somewhere in the string.
pub fn contains(&self, pat: &Wtf8) -> bool {
    let needle = pat.as_bytes();
    self.bytes.contains_str(needle)
}
pub fn contains_code_point(&self, pat: CodePoint) -> bool {
self.bytes
.contains_str(pat.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
}
/// Returns the sub-slice for `range`, or `None` when the range is reversed
/// or either end is not a code point boundary.
pub fn get(&self, range: impl ops::RangeBounds<usize>) -> Option<&Self> {
    use core::ops::Bound;

    // Normalise both bounds to a half-open `start..end` pair.
    let start = match range.start_bound() {
        Bound::Unbounded => 0,
        Bound::Included(&at) => at,
        Bound::Excluded(&at) => at.saturating_add(1),
    };
    let end = match range.end_bound() {
        Bound::Unbounded => self.len(),
        Bound::Included(&at) => at.saturating_add(1),
        Bound::Excluded(&at) => at,
    };
    let valid =
        start <= end && is_code_point_boundary(self, start) && is_code_point_boundary(self, end);
    if !valid {
        return None;
    }
    // SAFETY: both offsets were just checked to be ordered boundaries.
    Some(unsafe { slice_unchecked(self, start, end) })
}
/// Returns `true` when the string ends with the bytes of `w`.
pub fn ends_with(&self, w: impl AsRef<Wtf8>) -> bool {
    let suffix: &Wtf8 = w.as_ref();
    self.bytes.ends_with_str(suffix)
}
/// Returns `true` when the string starts with the bytes of `w`.
pub fn starts_with(&self, w: impl AsRef<Wtf8>) -> bool {
    let prefix: &Wtf8 = w.as_ref();
    self.bytes.starts_with_str(prefix)
}
/// Returns the remainder after `w` when the string starts with it.
pub fn strip_prefix(&self, w: impl AsRef<Wtf8>) -> Option<&Self> {
    let prefix = w.as_ref().as_bytes();
    let rest = self.bytes.strip_prefix(prefix)?;
    // SAFETY: removing a whole WTF-8 prefix leaves the cut on a boundary.
    Some(unsafe { Wtf8::from_bytes_unchecked(rest) })
}
/// Returns the part before `w` when the string ends with it.
pub fn strip_suffix(&self, w: impl AsRef<Wtf8>) -> Option<&Self> {
    let suffix = w.as_ref().as_bytes();
    let rest = self.bytes.strip_suffix(suffix)?;
    // SAFETY: removing a whole WTF-8 suffix leaves the cut on a boundary.
    Some(unsafe { Wtf8::from_bytes_unchecked(rest) })
}
pub fn replace(&self, from: &Wtf8, to: &Wtf8) -> Wtf8Buf {
let w = self.bytes.replace(from, to);
unsafe { Wtf8Buf::from_bytes_unchecked(w) }
}
pub fn replacen(&self, from: &Wtf8, to: &Wtf8, n: usize) -> Wtf8Buf {
let w = self.bytes.replacen(from, to, n);
unsafe { Wtf8Buf::from_bytes_unchecked(w) }
}
}
impl AsRef<Wtf8> for str {
    /// Views a UTF-8 string as WTF-8 without copying.
    fn as_ref(&self) -> &Wtf8 {
        let utf8 = self.as_bytes();
        // SAFETY: the bytes come straight from a `str`.
        unsafe { Wtf8::from_bytes_unchecked(utf8) }
    }
}
impl AsRef<[u8]> for Wtf8 {
    /// Returns the underlying bytes.
    fn as_ref(&self) -> &[u8] {
        &self.bytes
    }
}
impl ops::Index<ops::Range<usize>> for Wtf8 {
    type Output = Wtf8;

    /// Returns the sub-slice `start..end`.
    ///
    /// # Panics
    ///
    /// Panics when the range is reversed or either end is not a code point
    /// boundary.
    #[inline]
    #[track_caller]
    fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
        let ops::Range { start, end } = range;
        let valid = start <= end
            && is_code_point_boundary(self, start)
            && is_code_point_boundary(self, end);
        if !valid {
            slice_error_fail(self, start, end)
        }
        // SAFETY: both offsets were just checked to be ordered boundaries.
        unsafe { slice_unchecked(self, start, end) }
    }
}
impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
    type Output = Wtf8;

    /// Returns the sub-slice from `start` to the end.
    ///
    /// # Panics
    ///
    /// Panics when `start` is not a code point boundary.
    #[inline]
    #[track_caller]
    fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
        let start = range.start;
        if !is_code_point_boundary(self, start) {
            slice_error_fail(self, start, self.len())
        }
        // SAFETY: `start` was just checked to be a boundary within the slice.
        unsafe { slice_unchecked(self, start, self.len()) }
    }
}
impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
    type Output = Wtf8;

    /// Returns the sub-slice from the start up to `end`.
    ///
    /// # Panics
    ///
    /// Panics when `end` is not a code point boundary.
    #[inline]
    #[track_caller]
    fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
        let end = range.end;
        if !is_code_point_boundary(self, end) {
            slice_error_fail(self, 0, end)
        }
        // SAFETY: `end` was just checked to be a boundary within the slice.
        unsafe { slice_unchecked(self, 0, end) }
    }
}
// Indexing with `..` returns the whole slice unchanged.
impl ops::Index<ops::RangeFull> for Wtf8 {
type Output = Wtf8;
#[inline]
fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
self
}
}
/// Rebuilds a 16-bit surrogate from the second and third byte of its
/// three-byte encoding: the low six bits of each byte are combined and
/// placed on top of the `0xD800` base.
#[inline]
const fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    let high_bits = ((second_byte & 0x3F) as u16) << 6;
    let low_bits = (third_byte & 0x3F) as u16;
    0xD800 | high_bits | low_bits
}
/// Combines a lead (`0xD800..`) and trail (`0xDC00..`) surrogate into the
/// supplementary-plane `char` they encode.
#[inline]
const fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    let high = (lead - 0xD800) as u32;
    let low = (trail - 0xDC00) as u32;
    let code_point = 0x10000 + ((high << 10) | low);
    // SAFETY: for a genuine lead/trail pair the result lies in
    // 0x10000..=0x10FFFF, which is never a surrogate.
    unsafe { char::from_u32_unchecked(code_point) }
}
/// Returns `true` when `index` is 0, equals the length of `slice`, or points
/// at a byte that is not a continuation byte (`0x80..=0xBF`).
#[inline]
fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
    if index == 0 {
        return true;
    }
    let len = slice.len();
    if index >= len {
        // Only the one-past-the-end offset counts as a boundary.
        return index == len;
    }
    // Reinterpreted as `i8`, continuation bytes are exactly those below -0x40.
    (slice.bytes[index] as i8) >= -0x40
}
// Checks that `index` is a usable boundary in `slice`, panicking otherwise.
//
// Offsets 0 and `slice.len()` are always accepted. The function panics when
// `index` is out of bounds, points at a continuation byte, or points at a
// surrogate that directly follows another surrogate.
#[track_caller]
#[inline]
pub fn check_utf8_boundary(slice: &Wtf8, index: usize) {
if index == 0 {
return;
}
match slice.bytes.get(index) {
// A 0xED lead byte may start a surrogate, so it falls through to the check
// below; any other non-continuation byte is an ordinary boundary.
Some(0xED) => (), Some(&b) if (b as i8) >= -0x40 => return,
Some(_) => panic!("byte index {index} is not a codepoint boundary"),
None if index == slice.len() => return,
None => panic!("byte index {index} is out of bounds"),
}
// Second byte >= 0xA0 after 0xED means a surrogate starts at `index`.
if slice.bytes[index + 1] >= 0xA0 {
// Reject the offset if the three bytes before it also encode a surrogate.
if index >= 3 && slice.bytes[index - 3] == 0xED && slice.bytes[index - 2] >= 0xA0 {
panic!("byte index {index} lies between surrogate codepoints");
}
}
}
/// Returns the sub-slice `begin..end` of `s` without any checks.
///
/// # Safety
///
/// NOTE(review): presumably `begin <= end <= s.len()` and both offsets must
/// be code point boundaries, as every caller in this file checks — confirm.
#[inline]
pub const unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
    let len = end - begin;
    // SAFETY: upheld by the caller, see above.
    unsafe {
        let first = s.as_bytes().as_ptr().add(begin);
        let raw = slice::from_raw_parts(first, len);
        Wtf8::from_bytes_unchecked(raw)
    }
}
// Reports a failed slicing operation; never returns.
//
// A reversed range trips the assertion first; otherwise the panic message
// names both offsets and shows the string's `Debug` form.
#[inline(never)]
#[track_caller]
pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
assert!(begin <= end);
panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary");
}
/// Iterator over the code points of a `Wtf8` slice; created by
/// `Wtf8::code_points`.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
// Bytes that have not been decoded yet.
bytes: slice::Iter<'a, u8>,
}
impl Iterator for Wtf8CodePoints<'_> {
    type Item = CodePoint;

    /// Decodes the next code point from the front.
    #[inline]
    fn next(&mut self) -> Option<CodePoint> {
        // SAFETY: relies on the remaining bytes being well-formed WTF-8.
        let value = unsafe { next_code_point(&mut self.bytes) }?;
        Some(CodePoint { value })
    }

    /// Bounds derived from the remaining byte count: at least a quarter of
    /// it (rounded up) and at most all of it.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.bytes.len();
        let lower = remaining.saturating_add(3) / 4;
        (lower, Some(remaining))
    }

    /// The last code point is simply the first one from the back.
    fn last(mut self) -> Option<Self::Item> {
        self.next_back()
    }

    /// Counts the remaining code points without decoding each one here.
    fn count(self) -> usize {
        let rest = self.as_wtf8();
        core_str_count::count_chars(rest)
    }
}
impl DoubleEndedIterator for Wtf8CodePoints<'_> {
    /// Decodes the next code point from the back.
    #[inline]
    fn next_back(&mut self) -> Option<CodePoint> {
        // SAFETY: relies on the remaining bytes being well-formed WTF-8.
        let value = unsafe { next_code_point_reverse(&mut self.bytes) }?;
        // SAFETY: the value was decoded from well-formed WTF-8.
        Some(unsafe { CodePoint::from_u32_unchecked(value) })
    }
}
impl<'a> Wtf8CodePoints<'a> {
    /// Returns the part of the string that has not been iterated yet.
    pub fn as_wtf8(&self) -> &'a Wtf8 {
        let remaining: &'a [u8] = self.bytes.as_slice();
        // SAFETY: the iterator only ever stops between code points.
        unsafe { Wtf8::from_bytes_unchecked(remaining) }
    }
}
/// Iterator over `(byte offset, code point)` pairs; created by
/// `Wtf8::code_point_indices`.
#[derive(Clone)]
pub struct Wtf8CodePointIndices<'a> {
// Byte offset of the next code point yielded from the front.
front_offset: usize,
// Underlying code point iterator.
iter: Wtf8CodePoints<'a>,
}
impl Iterator for Wtf8CodePointIndices<'_> {
type Item = (usize, CodePoint);
#[inline]
fn next(&mut self) -> Option<(usize, CodePoint)> {
let pre_len = self.iter.bytes.len();
match self.iter.next() {
None => None,
Some(ch) => {
let index = self.front_offset;
let len = self.iter.bytes.len();
self.front_offset += pre_len - len;
Some((index, ch))
}
}
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
#[inline]
fn last(mut self) -> Option<(usize, CodePoint)> {
self.next_back()
}
#[inline]
fn count(self) -> usize {
self.iter.count()
}
}
impl DoubleEndedIterator for Wtf8CodePointIndices<'_> {
    /// Yields the last remaining code point together with its byte offset.
    #[inline]
    fn next_back(&mut self) -> Option<(usize, CodePoint)> {
        let code_point = self.iter.next_back()?;
        // What is left in the inner iterator ends where this code point starts.
        let start = self.front_offset + self.iter.bytes.len();
        Some((start, code_point))
    }
}
// Marks the index iterator as fused.
impl FusedIterator for Wtf8CodePointIndices<'_> {}
/// Iterator over 16-bit code units; created by `Wtf8::encode_wide`.
#[derive(Clone)]
pub struct EncodeWide<'a> {
// Source code points still to be encoded.
code_points: Wtf8CodePoints<'a>,
// Pending second unit of a two-unit encoding; 0 means nothing is pending.
extra: u16,
}
impl Iterator for EncodeWide<'_> {
type Item = u16;
#[inline]
fn next(&mut self) -> Option<u16> {
if self.extra != 0 {
let tmp = self.extra;
self.extra = 0;
return Some(tmp);
}
let mut buf = [0; MAX_LEN_UTF16];
self.code_points.next().map(|code_point| {
let n = encode_utf16_raw(code_point.value, &mut buf).len();
if n == 2 {
self.extra = buf[1];
}
buf[0]
})
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let (low, high) = self.code_points.size_hint();
let ext = (self.extra != 0) as usize;
(
low + ext,
high.and_then(|n| n.checked_mul(2))
.and_then(|n| n.checked_add(ext)),
)
}
}
// Marks the wide-encoding iterator as fused.
impl FusedIterator for EncodeWide<'_> {}
/// Iterator over the UTF-8 runs and surrogates of a `Wtf8` slice; created by
/// `Wtf8::chunks`.
pub struct Wtf8Chunks<'a> {
// The part of the string that has not been yielded yet.
wtf8: &'a Wtf8,
}
impl<'a> Iterator for Wtf8Chunks<'a> {
type Item = Wtf8Chunk<'a>;
fn next(&mut self) -> Option<Self::Item> {
match self.wtf8.next_surrogate(0) {
Some((0, surrogate)) => {
self.wtf8 = &self.wtf8[3..];
Some(Wtf8Chunk::Surrogate(surrogate.into()))
}
Some((n, _)) => {
let s = unsafe { str::from_utf8_unchecked(&self.wtf8.as_bytes()[..n]) };
self.wtf8 = &self.wtf8[n..];
Some(Wtf8Chunk::Utf8(s))
}
None => {
let s =
unsafe { str::from_utf8_unchecked(core::mem::take(&mut self.wtf8).as_bytes()) };
(!s.is_empty()).then_some(Wtf8Chunk::Utf8(s))
}
}
}
}
/// One item produced by `Wtf8Chunks`.
pub enum Wtf8Chunk<'a> {
// A run of the string that contains no surrogate.
Utf8(&'a str),
// A single surrogate code point.
Surrogate(CodePoint),
}
impl Hash for CodePoint {
#[inline]
fn hash<H: Hasher>(&self, state: &mut H) {
self.value.hash(state)
}
}
/// Converts a boxed byte slice into a boxed `Wtf8` without validation.
///
/// # Safety
///
/// NOTE(review): presumably `value` must contain well-formed WTF-8 — confirm.
pub unsafe fn from_boxed_wtf8_unchecked(value: Box<[u8]>) -> Box<Wtf8> {
    let raw: *mut [u8] = Box::into_raw(value);
    // SAFETY: `raw` was just produced by `Box::into_raw`; only its nominal
    // type changes.
    unsafe { Box::from_raw(raw as *mut Wtf8) }
}
impl Clone for Box<Wtf8> {
fn clone(&self) -> Self {
(&**self).into()
}
}
impl Default for Box<Wtf8> {
    /// Returns a boxed, empty `Wtf8`.
    fn default() -> Self {
        let empty: Box<[u8]> = Box::default();
        // SAFETY: an empty byte slice contains no ill-formed sequence.
        unsafe { from_boxed_wtf8_unchecked(empty) }
    }
}
impl From<&Wtf8> for Box<Wtf8> {
fn from(w: &Wtf8) -> Self {
w.into_box()
}
}
impl<'a> From<&'a str> for &'a Wtf8 {
    /// Views a UTF-8 string as WTF-8 without copying.
    #[inline]
    fn from(s: &'a str) -> &'a Wtf8 {
        let utf8: &'a [u8] = s.as_bytes();
        // SAFETY: the bytes come straight from a `str`.
        unsafe { Wtf8::from_bytes_unchecked(utf8) }
    }
}
impl From<&str> for Box<Wtf8> {
    /// Copies the string into a `Box<str>` and converts that box.
    fn from(s: &str) -> Self {
        let boxed: Box<str> = Box::from(s);
        Self::from(boxed)
    }
}
impl From<Box<str>> for Box<Wtf8> {
    /// Converts the box by handing over its bytes.
    fn from(s: Box<str>) -> Self {
        let bytes: Box<[u8]> = s.into_boxed_bytes();
        // SAFETY: the bytes come straight from a `str`.
        unsafe { from_boxed_wtf8_unchecked(bytes) }
    }
}
impl From<Box<ascii::AsciiStr>> for Box<Wtf8> {
    /// Converts via `Box<str>`.
    fn from(s: Box<ascii::AsciiStr>) -> Self {
        let boxed: Box<str> = Box::from(s);
        Self::from(boxed)
    }
}
impl From<Box<Wtf8>> for Box<[u8]> {
    /// Converts the box into its underlying bytes.
    fn from(w: Box<Wtf8>) -> Self {
        let raw: *mut Wtf8 = Box::into_raw(w);
        // SAFETY: `raw` was just produced by `Box::into_raw`; only its
        // nominal type changes.
        unsafe { Box::from_raw(raw as *mut [u8]) }
    }
}
impl From<Wtf8Buf> for Box<Wtf8> {
fn from(w: Wtf8Buf) -> Self {
w.into_box()
}
}
impl From<Box<Wtf8>> for Wtf8Buf {
fn from(w: Box<Wtf8>) -> Self {
Wtf8Buf::from_box(w)
}
}
impl From<String> for Box<Wtf8> {
    /// Converts via `Box<str>`.
    fn from(s: String) -> Self {
        let boxed: Box<str> = s.into_boxed_str();
        Self::from(boxed)
    }
}
mod concat;
pub use concat::Wtf8Concat;