#![no_std]
#![warn(clippy::pedantic)]
#![warn(missing_docs, clippy::missing_docs_in_private_items)]
use core::{
borrow::Borrow,
cmp::Ordering,
hash::{Hash, Hasher},
hint::assert_unchecked as assume,
ops::Deref,
};
use representation::{codepoint_len_lut, Utf8CharInner};
use std_at_home::TAG_CONTINUATION;
mod charapi;
pub mod iter;
mod representation;
mod std_at_home;
#[cfg(test)]
mod tests;
pub use iter::{Utf8CharIter, IntoUtf8Chars};
/// A single character kept in its UTF-8 encoded form instead of as a `char`.
///
/// Values are built from a `char` with [`Utf8Char::from_char`] or from the
/// first character of a string with [`Utf8Char::from_first_char`]. Both
/// constructors fill a four-byte array (unused trailing bytes are set to
/// `TAG_CONTINUATION`) and hand it to the private `Utf8CharInner`
/// representation wrapped by this type.
///
/// [`Utf8Char::as_str`] (also reachable through `Deref`, `AsRef<str>` and
/// `Borrow<str>`) exposes exactly the encoded bytes as a `&str`.
///
/// Equality and ordering are derived from `Utf8CharInner`; hashing is
/// implemented by hand in terms of [`Utf8Char::as_str`].
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Utf8Char(Utf8CharInner);
impl Utf8Char {
    /// Returns the value `codepoint_len_lut` associates with `byte`.
    ///
    /// Callers in this file pass the first byte of a UTF-8 sequence and treat
    /// the result as the length of that sequence in bytes.
    #[must_use]
    const fn codepoint_len(byte: u8) -> u8 {
        codepoint_len_lut(byte)
    }

    /// Returns the encoded length reported by the inner representation, as a `u8`.
    ///
    /// [`Self::as_str`] uses this value as the number of leading bytes of the
    /// backing array that belong to the character.
    #[must_use]
    pub const fn len_utf8(self) -> u8 {
        let len = self.0.len_utf8();
        len as u8
    }

    /// Builds a [`Utf8Char`] from the first character of `s`.
    ///
    /// Returns `None` when `s` is empty; anything after the first character
    /// is ignored.
    #[must_use]
    pub const fn from_first_char(s: &str) -> Option<Self> {
        if s.is_empty() {
            return None;
        }
        // SAFETY: `s` was checked to be non-empty just above, which is the
        // only precondition of `from_first_char_unchecked`.
        Some(unsafe { Self::from_first_char_unchecked(s) })
    }

    /// Builds a [`Utf8Char`] from the first character of `s` without checking
    /// that `s` is non-empty.
    ///
    /// Only the bytes of the first character are read; the rest of `s` is
    /// ignored.
    ///
    /// # Safety
    ///
    /// `s` must not be empty. Passing an empty string is undefined behaviour.
    #[must_use]
    pub const unsafe fn from_first_char_unchecked(s: &str) -> Self {
        /// Filler written to the array slots the character does not occupy.
        const FILLER: u8 = TAG_CONTINUATION;

        // SAFETY: the caller guarantees that `s` is not empty.
        unsafe { assume(!s.is_empty()) };
        let bytes = s.as_bytes();
        let lead = bytes[0];
        let width = Self::codepoint_len(lead);
        // SAFETY: `s` is a `&str`, so `lead` starts a well-formed UTF-8
        // sequence. This relies on `codepoint_len_lut` returning 1..=4 for
        // such bytes (the `roundtrip` test checks that for every `char`).
        unsafe { assume(width >= 1 && width <= 4) };
        // SAFETY: a well-formed `&str` contains the whole sequence announced
        // by its first byte, so at least `width` bytes are present.
        unsafe { assume(width as usize <= bytes.len()) };

        let mut encoded = [lead, FILLER, FILLER, FILLER];
        // The copy is spelled out slot by slot; each branch restates the
        // bound that already follows from `width <= bytes.len()`.
        if width >= 2 {
            // SAFETY: `bytes.len() >= width >= 2`.
            unsafe { assume(bytes.len() > 1) };
            encoded[1] = bytes[1];
        }
        if width >= 3 {
            // SAFETY: `bytes.len() >= width >= 3`.
            unsafe { assume(bytes.len() > 2) };
            encoded[2] = bytes[2];
        }
        if width >= 4 {
            // SAFETY: `bytes.len() >= width >= 4`.
            unsafe { assume(bytes.len() > 3) };
            encoded[3] = bytes[3];
        }

        // SAFETY: `encoded` holds the UTF-8 bytes of one character followed by
        // `TAG_CONTINUATION` padding. NOTE(review): the exact contract of
        // `from_utf8char_array` lives in `representation` — confirm it matches.
        Self(unsafe { Utf8CharInner::from_utf8char_array(encoded) })
    }

    /// Builds a [`Utf8Char`] by UTF-8 encoding `code`.
    #[must_use]
    pub const fn from_char(code: char) -> Self {
        // Start fully padded; `encode_utf8` overwrites only the bytes it needs.
        let mut encoded = [TAG_CONTINUATION; 4];
        code.encode_utf8(&mut encoded);
        // SAFETY: `encoded` holds the UTF-8 bytes of `code` followed by
        // `TAG_CONTINUATION` padding. NOTE(review): the exact contract of
        // `from_utf8char_array` lives in `representation` — confirm it matches.
        Self(unsafe { Utf8CharInner::from_utf8char_array(encoded) })
    }

    /// Converts back to a `char` by delegating to `std_at_home::to_char`.
    #[must_use]
    pub const fn to_char(self) -> char {
        std_at_home::to_char(self)
    }

    /// Returns the encoded character as a string slice.
    ///
    /// The slice covers the first [`Self::len_utf8`] bytes of the backing
    /// array and borrows from `self`.
    #[must_use]
    pub const fn as_str(&self) -> &str {
        let used = self.len_utf8() as usize;
        let backing = self.0.as_array();
        // SAFETY: requires `used` to be no larger than the backing array.
        // NOTE(review): presumably guaranteed by `Utf8CharInner::len_utf8`
        // (the `roundtrip` test checks it equals `char::len_utf8`) — confirm.
        let (encoded, _padding) = unsafe { backing.split_at_unchecked(used) };
        // SAFETY: both constructors in this file store the UTF-8 encoding of a
        // single character in the leading `used` bytes.
        unsafe { core::str::from_utf8_unchecked(encoded) }
    }
}
impl From<char> for Utf8Char {
fn from(value: char) -> Self {
Self::from_char(value)
}
}
impl From<Utf8Char> for char {
fn from(value: Utf8Char) -> Self {
value.to_char()
}
}
impl AsRef<str> for Utf8Char {
fn as_ref(&self) -> &str {
self.as_str()
}
}
impl Borrow<str> for Utf8Char {
fn borrow(&self) -> &str {
self.as_str()
}
}
impl PartialEq<str> for Utf8Char {
fn eq(&self, other: &str) -> bool {
self.as_str().eq(other)
}
}
/// Equality between a string slice and a [`Utf8Char`] (the mirrored direction).
impl PartialEq<Utf8Char> for str {
    /// Returns `true` when `self` is exactly the character encoded by `other`.
    fn eq(&self, other: &Utf8Char) -> bool {
        let encoded: &str = other.as_str();
        self == encoded
    }
}
/// Ordering of a [`Utf8Char`] relative to a string slice.
impl PartialOrd<str> for Utf8Char {
    /// Compares the encoded character with `other` using `str`'s total order;
    /// the result is therefore always `Some`.
    fn partial_cmp(&self, other: &str) -> Option<Ordering> {
        let ordering = Ord::cmp(self.as_str(), other);
        Some(ordering)
    }
}
/// Ordering of a string slice relative to a [`Utf8Char`] (the mirrored direction).
impl PartialOrd<Utf8Char> for str {
    /// Compares `self` with the encoded character using `str`'s total order;
    /// the result is therefore always `Some`.
    fn partial_cmp(&self, other: &Utf8Char) -> Option<Ordering> {
        let ordering = Ord::cmp(self, other.as_str());
        Some(ordering)
    }
}
/// Hashes a [`Utf8Char`] exactly like the `str` returned by [`Utf8Char::as_str`].
///
/// Written by hand (rather than derived) so that the hash matches the one of
/// the value obtained through `Borrow<str>`.
impl Hash for Utf8Char {
    /// Feeds the encoded character into `state` using `str`'s `Hash` impl.
    fn hash<H: Hasher>(&self, state: &mut H) {
        Hash::hash(self.as_str(), state);
    }
}
impl Deref for Utf8Char {
type Target = str;
fn deref(&self) -> &str {
self.as_str()
}
}
#[cfg(test)]
extern crate alloc;
/// For every `char` produced by `tests::all_chars`, checks that both
/// constructors agree with `char::encode_utf8` and convert back losslessly.
#[test]
fn roundtrip() {
    use rayon::iter::ParallelIterator;
    tests::all_chars().for_each(|ch| {
        // Reference encoding produced by the standard library, padded the same
        // way the constructors pad their arrays.
        let mut buf = [TAG_CONTINUATION; 4];
        let s = ch.encode_utf8(&mut buf);
        let expected_len = ch.len_utf8();

        let utf8_alt = Utf8Char::from_first_char(s).unwrap();
        let utf8 = Utf8Char::from_char(ch);

        // Length derived from the first byte must match the real length.
        let codelen = Utf8Char::codepoint_len(utf8.0.first_byte().0 as u8);
        assert!(matches!(codelen, 1..=4));
        assert_eq!(codelen as usize, expected_len);
        assert_eq!(utf8.len_utf8() as usize, expected_len);

        // String views must equal the reference encoding.
        assert_eq!(s, &*utf8);
        assert_eq!(&*utf8_alt, s);

        // Raw arrays (including padding) must equal the reference buffer.
        assert_eq!(&buf, utf8.0.as_array());
        assert_eq!(&buf, utf8_alt.0.as_array());

        // Decoding must give back the original char.
        assert_eq!(utf8.to_char(), ch);
        assert_eq!(utf8_alt.to_char(), ch);
    });
}
/// An empty string has no first character, so the checked constructor yields `None`.
#[test]
fn empty_string() {
    let parsed = Utf8Char::from_first_char("");
    assert!(parsed.is_none());
}
/// For every `char` produced by `tests::all_chars`, checks that the `Display`
/// and `Debug` output of its [`Utf8Char`] matches the output of the `char`
/// itself, and that the `Display` text parses back to the same value.
#[test]
fn displays() {
    use alloc::string::String;
    use core::{fmt::Write, write};

    /// Runs the formatting checks for one character, reusing the two scratch
    /// buffers in `buffers` (first for the `Utf8Char`, second for the `char`).
    fn check_one(buffers: &mut (String, String), utf32: char) {
        let (utf8_out, utf32_out) = buffers;
        let utf8 = Utf8Char::from_char(utf32);

        // `Display` output must be identical and equal to the encoded text.
        utf8_out.clear();
        utf32_out.clear();
        write!(utf8_out, "{utf8}").unwrap();
        write!(utf32_out, "{utf32}").unwrap();
        assert_eq!(utf8_out, utf32_out);
        assert_eq!(utf8_out, utf8.as_str());

        // The displayed text must parse back through both constructors.
        assert_eq!(Utf8Char::from_first_char(utf8_out.as_str()), Some(utf8));
        assert_eq!(
            // SAFETY: the buffer was just asserted equal to `utf8.as_str()`,
            // the encoding of one character, so it is not empty.
            unsafe { Utf8Char::from_first_char_unchecked(utf8_out.as_str()) },
            utf8
        );

        // `Debug` output must be identical as well.
        utf8_out.clear();
        utf32_out.clear();
        write!(utf8_out, "{utf8:?}").unwrap();
        write!(utf32_out, "{utf32:?}").unwrap();
        assert_eq!(utf8_out, utf32_out);
    }

    #[cfg(not(miri))]
    {
        use rayon::iter::ParallelIterator;
        // `for_each_with` gives `check_one` a mutable scratch-buffer pair.
        tests::all_chars().for_each_with((String::new(), String::new()), check_one);
    }
    #[cfg(miri)]
    {
        // Under miri a single buffer pair is shared across the whole iteration.
        let mut bufs = (String::new(), String::new());
        tests::all_chars().for_each(|ch| check_one(&mut bufs, ch));
    }
}
/// For every pair produced by `tests::all_char_pairs`, checks that ordering and
/// ASCII-case-insensitive equality of [`Utf8Char`] agree with those of `char`.
#[test]
fn total_ordering() {
    use rayon::iter::ParallelIterator;
    tests::all_char_pairs().for_each(|(lhs, rhs)| {
        let lhs_utf8 = Utf8Char::from_char(lhs);
        let rhs_utf8 = Utf8Char::from_char(rhs);

        // `Ord` on the encoded form must mirror `Ord` on the scalar values.
        assert_eq!(lhs_utf8.cmp(&rhs_utf8), lhs.cmp(&rhs));

        // Case-insensitive comparison must mirror `char::eq_ignore_ascii_case`.
        assert_eq!(
            lhs_utf8.eq_ignore_ascii_case(rhs_utf8),
            lhs.eq_ignore_ascii_case(&rhs)
        );
    });
}