use alloc::string::String;
use alloc::vec::Vec;
/// Smallest code point that is encoded with two bytes; everything below is a
/// single byte.
const TWO_BYTE_MIN: u32 = 0x80;
/// Smallest code point that is encoded with three bytes.
const THREE_BYTE_MIN: u32 = 0x800;
/// Smallest code point that is encoded with four bytes. Code points at or
/// above this value also occupy two UTF-16 units instead of one.
const FOUR_BYTE_MIN: u32 = 0x1_0000;
/// Largest value accepted by the encoder; anything above is replaced by
/// `0xFFFD` before encoding.
const CODE_POINT_MAX: u32 = 0x10_FFFF;
/// First value of the surrogate range (also the first high surrogate).
const SURROGATE_MIN: u32 = 0xD800;
/// Last value of the surrogate range (also the last low surrogate).
const SURROGATE_MAX: u32 = 0xDFFF;
/// Last high (leading) surrogate.
const HIGH_SURROGATE_MAX: u32 = 0xDBFF;
/// First low (trailing) surrogate.
const LOW_SURROGATE_MIN: u32 = 0xDC00;
/// Reports whether `cp` lies in the surrogate range `0xD800..=0xDFFF`
/// (both bounds included).
#[must_use]
#[inline]
pub const fn is_surrogate(cp: u32) -> bool {
    matches!(cp, SURROGATE_MIN..=SURROGATE_MAX)
}
/// Appends the byte encoding of `cp` to `out`.
///
/// The byte layout is that of UTF-8: one to four bytes, chosen by the
/// magnitude of the value. No surrogate check is performed, so values in
/// `0xD800..=0xDFFF` are written as ordinary three-byte sequences. Any value
/// above `0x10FFFF` is replaced by `0xFFFD` before it is encoded.
pub fn encode_code_point(cp: u32, out: &mut Vec<u8>) {
    let value = if cp <= CODE_POINT_MAX { cp } else { 0xFFFD };
    // Continuation byte carrying the six bits of `value` that start at `shift`.
    let trail = |shift: u32| (0x80 | ((value >> shift) & 0x3F)) as u8;

    if value < TWO_BYTE_MIN {
        out.push(value as u8);
        return;
    }
    if value < THREE_BYTE_MIN {
        out.extend_from_slice(&[(0xC0 | (value >> 6)) as u8, trail(0)]);
        return;
    }
    if value < FOUR_BYTE_MIN {
        out.extend_from_slice(&[(0xE0 | (value >> 12)) as u8, trail(6), trail(0)]);
        return;
    }
    out.extend_from_slice(&[
        (0xF0 | (value >> 18)) as u8,
        trail(12),
        trail(6),
        trail(0),
    ]);
}
/// Appends a single UTF-16 code unit to `out`, encoded as if it were a code
/// point of the same numeric value.
///
/// The unit is encoded on its own: a low surrogate appended right after a
/// high surrogate is NOT merged into one four-byte sequence. Use
/// [`from_utf16`] when adjacent surrogates should be paired.
pub fn encode_utf16_unit(u: u16, out: &mut Vec<u8>) {
    let code_point: u32 = u.into();
    encode_code_point(code_point, out);
}
/// Converts a sequence of UTF-16 code units into the byte encoding produced
/// by [`encode_code_point`].
///
/// A high surrogate immediately followed by a low surrogate is combined into
/// one supplementary code point. Every other unit, including unpaired
/// surrogates, is encoded individually.
#[must_use]
pub fn from_utf16(units: &[u16]) -> Vec<u8> {
    let mut encoded = Vec::with_capacity(units.len());
    let mut rest = units.iter().copied().map(u32::from).peekable();

    while let Some(unit) = rest.next() {
        let is_high = unit >= SURROGATE_MIN && unit <= HIGH_SURROGATE_MAX;
        // Only look ahead when the current unit can start a pair.
        let partner = if is_high {
            rest.peek()
                .copied()
                .filter(|next| (LOW_SURROGATE_MIN..=SURROGATE_MAX).contains(next))
        } else {
            None
        };

        match partner {
            Some(low) => {
                // Consume the low surrogate that was only peeked at so far.
                rest.next();
                let combined =
                    0x1_0000 + ((unit - SURROGATE_MIN) << 10) + (low - LOW_SURROGATE_MIN);
                encode_code_point(combined, &mut encoded);
            }
            None => encode_code_point(unit, &mut encoded),
        }
    }

    encoded
}
/// Decodes the code point that starts at byte offset `i` of `bytes`.
///
/// Returns `(code_point, bytes_consumed)`. The decoder is deliberately
/// lenient about *values*: surrogates (`0xD800..=0xDFFF`) and overlong forms
/// are returned as-is. It is strict about *structure*: the result is
/// `(0xFFFD, 1)` — so the caller resynchronises on the very next byte — when
///
/// * the lead byte is a stray continuation byte (`0x80..=0xBF`) or is
///   `0xF8` or above,
/// * the sequence is cut short by the end of the slice,
/// * a trailing byte is not of the form `10xxxxxx`, or
/// * the decoded value exceeds `0x10FFFF`.
///
/// # Panics
///
/// Panics if `i >= bytes.len()`.
fn decode_code_point(bytes: &[u8], i: usize) -> (u32, usize) {
    const REPLACEMENT: (u32, usize) = (0xFFFD, 1);

    let lead = bytes[i];
    // Classify the lead byte: total sequence length and its payload bits.
    let (len, lead_bits) = match lead {
        0x00..=0x7F => return (u32::from(lead), 1),
        0xC0..=0xDF => (2, u32::from(lead & 0x1F)),
        0xE0..=0xEF => (3, u32::from(lead & 0x0F)),
        0xF0..=0xF7 => (4, u32::from(lead & 0x07)),
        // 0x80..=0xBF is a continuation byte and 0xF8..=0xFF never starts a
        // sequence; neither may swallow the bytes that follow.
        _ => return REPLACEMENT,
    };

    // `get` yields `None` when the sequence runs past the end of the slice.
    let tail = match bytes.get(i + 1..i + len) {
        Some(tail) => tail,
        None => return REPLACEMENT,
    };

    let mut cp = lead_bits;
    for &b in tail {
        // Every trailing byte must look like 10xxxxxx; otherwise it belongs
        // to the next sequence and must not be consumed here.
        if b & 0xC0 != 0x80 {
            return REPLACEMENT;
        }
        cp = (cp << 6) | u32::from(b & 0x3F);
    }

    // A four-byte form can express values beyond the last code point.
    if cp > 0x10_FFFF {
        return REPLACEMENT;
    }
    (cp, len)
}
/// Creates an iterator over the code points encoded in `bytes`, starting at
/// the first byte.
pub fn code_points(bytes: &[u8]) -> CodePoints<'_> {
    let i = 0;
    CodePoints { bytes, i }
}
/// Iterator over the code points encoded in a byte slice; built by
/// `code_points`. Each item is a raw `u32` value.
#[derive(Clone)]
pub struct CodePoints<'a> {
// The encoded input being walked.
bytes: &'a [u8],
// Byte offset of the next code point to decode.
i: usize,
}
impl Iterator for CodePoints<'_> {
    type Item = u32;

    /// Decodes the code point at the current offset and steps past it.
    /// Yields `None` once the offset has reached the end of the slice.
    fn next(&mut self) -> Option<u32> {
        if self.i < self.bytes.len() {
            let (code_point, consumed) = decode_code_point(self.bytes, self.i);
            self.i += consumed;
            Some(code_point)
        } else {
            None
        }
    }
}
/// Creates an iterator over the UTF-16 code units that correspond to the
/// code points encoded in `bytes`.
pub fn utf16_units(bytes: &[u8]) -> Utf16Units<'_> {
    let inner = code_points(bytes);
    Utf16Units {
        inner,
        pending_low: None,
    }
}
/// Iterator over UTF-16 code units derived from an encoded byte slice; built
/// by `utf16_units`.
#[derive(Clone)]
pub struct Utf16Units<'a> {
// Source of code points that are split into units.
inner: CodePoints<'a>,
// Low surrogate of a pair whose high half has already been yielded.
pending_low: Option<u16>,
}
impl Iterator for Utf16Units<'_> {
    type Item = u16;

    /// Yields the next UTF-16 unit.
    ///
    /// Code points at or above `0x10000` are split into a surrogate pair: the
    /// high half is returned immediately and the low half is stashed and
    /// returned by the following call. All other code points are yielded as
    /// a single unit.
    fn next(&mut self) -> Option<u16> {
        match self.pending_low.take() {
            Some(low) => Some(low),
            None => {
                let cp = self.inner.next()?;
                if cp < FOUR_BYTE_MIN {
                    return Some(cp as u16);
                }
                let offset = cp - FOUR_BYTE_MIN;
                let low = LOW_SURROGATE_MIN + (offset & 0x3FF);
                let high = SURROGATE_MIN + (offset >> 10);
                self.pending_low = Some(low as u16);
                Some(high as u16)
            }
        }
    }
}
#[must_use]
pub fn utf16_len(bytes: &[u8]) -> usize {
let mut len = 0;
let mut i = 0;
while i < bytes.len() {
let (cp, n) = decode_code_point(bytes, i);
len += if cp >= FOUR_BYTE_MIN { 2 } else { 1 };
i += n;
}
len
}
/// Returns the UTF-16 code unit at unit index `i`, or `None` when `i` is past
/// the last unit. The units are produced by walking `bytes` from the start.
#[must_use]
pub fn utf16_index(bytes: &[u8], i: usize) -> Option<u16> {
    let mut units = utf16_units(bytes);
    units.nth(i)
}
/// Maps a UTF-16 unit index to a byte offset in `bytes`.
///
/// When `unit` lands between the two units of a supplementary code point,
/// `round_up` chooses the end of that code point (`true`) or its start
/// (`false`). If the input ends before `unit` units have been seen, the
/// result is `bytes.len()`.
fn byte_offset_of_unit(bytes: &[u8], unit: usize, round_up: bool) -> usize {
    let mut consumed_units = 0;
    let mut pos = 0;
    loop {
        if pos >= bytes.len() {
            return bytes.len();
        }
        if consumed_units >= unit {
            return pos;
        }
        let (cp, byte_width) = decode_code_point(bytes, pos);
        let unit_width = if cp < FOUR_BYTE_MIN { 1 } else { 2 };
        // The target index splits this code point in half.
        if consumed_units + unit_width > unit {
            return if round_up { pos + byte_width } else { pos };
        }
        consumed_units += unit_width;
        pos += byte_width;
    }
}
/// Copies out the bytes covering UTF-16 units `start_unit..end_unit`.
///
/// Both indices are clamped to the total unit count, and an empty or
/// inverted range gives an empty vector. A boundary that falls inside a
/// surrogate pair is widened so the whole code point is included: the start
/// is rounded down and the end is rounded up.
#[must_use]
pub fn slice_utf16(bytes: &[u8], start_unit: usize, end_unit: usize) -> Vec<u8> {
    let total_units = utf16_len(bytes);
    let (first, last) = (start_unit.min(total_units), end_unit.min(total_units));
    if first >= last {
        return Vec::new();
    }

    let lo = byte_offset_of_unit(bytes, first, false);
    let hi = byte_offset_of_unit(bytes, last, true);
    if lo < hi {
        bytes[lo..hi].to_vec()
    } else {
        Vec::new()
    }
}
/// Borrows `bytes` as a `&str` when they are valid UTF-8; returns `None`
/// otherwise.
#[must_use]
pub fn as_str(bytes: &[u8]) -> Option<&str> {
    match core::str::from_utf8(bytes) {
        Ok(text) => Some(text),
        Err(_) => None,
    }
}
/// Reports whether `bytes` form valid UTF-8.
#[must_use]
pub fn is_utf8(bytes: &[u8]) -> bool {
    matches!(core::str::from_utf8(bytes), Ok(_))
}
/// Converts `bytes` to an owned `String`.
///
/// Valid UTF-8 is copied unchanged. Otherwise the input is decoded one code
/// point at a time, and every value that is not a valid `char` (such as a
/// surrogate) becomes U+FFFD.
#[must_use]
pub fn to_string_lossy(bytes: &[u8]) -> String {
    match as_str(bytes) {
        Some(valid) => String::from(valid),
        None => {
            let mut text = String::with_capacity(bytes.len());
            text.extend(
                code_points(bytes).map(|cp| char::from_u32(cp).unwrap_or('\u{FFFD}')),
            );
            text
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Encodes one code point into a fresh buffer.
    fn enc(cp: u32) -> Vec<u8> {
        let mut buf = Vec::new();
        encode_code_point(cp, &mut buf);
        buf
    }

    /// Scalars in the BMP encode exactly like standard UTF-8.
    #[test]
    fn bmp_scalar_is_plain_utf8() {
        assert_eq!(enc(u32::from(b'A')), b"A");
        assert_eq!(enc(0xE9), "é".as_bytes());
        assert_eq!(enc(0x4E2D), "中".as_bytes());
        for text in ["A", "é", "中", "hello world"] {
            assert_eq!(as_str(text.as_bytes()), Some(text));
            assert_eq!(utf16_len(text.as_bytes()), text.encode_utf16().count());
        }
    }

    /// A supplementary character is one code point but two UTF-16 units.
    #[test]
    fn astral_is_one_code_point_two_units() {
        let grin = "😀";
        let encoded = grin.as_bytes().to_vec();
        assert_eq!(enc(0x1_F600), encoded);
        assert_eq!(utf16_len(&encoded), 2);

        let units: Vec<u16> = utf16_units(&encoded).collect();
        assert_eq!(units, vec![0xD83D, 0xDE00]);

        let cps: Vec<u32> = code_points(&encoded).collect();
        assert_eq!(cps, vec![0x1_F600]);
    }

    /// An unpaired high surrogate survives a units -> bytes -> units trip.
    #[test]
    fn lone_high_surrogate_round_trips() {
        let encoded = from_utf16(&[0xD800]);
        assert_eq!(encoded, vec![0xED, 0xA0, 0x80]);
        assert_eq!(utf16_len(&encoded), 1);
        assert_eq!(utf16_index(&encoded, 0), Some(0xD800));

        let units: Vec<u16> = utf16_units(&encoded).collect();
        assert_eq!(units, vec![0xD800]);
        assert_eq!(from_utf16(&units), encoded);
        assert_eq!(as_str(&encoded), None);

        let cps: Vec<u32> = code_points(&encoded).collect();
        assert_eq!(cps, vec![0xD800]);
    }

    /// An unpaired low surrogate survives a units -> bytes -> units trip.
    #[test]
    fn lone_low_surrogate_round_trips() {
        let encoded = from_utf16(&[0xDC00]);
        assert_eq!(encoded, vec![0xED, 0xB0, 0x80]);
        assert_eq!(utf16_len(&encoded), 1);
        assert_eq!(utf16_index(&encoded, 0), Some(0xDC00));
        assert_eq!(
            from_utf16(&utf16_units(&encoded).collect::<Vec<_>>()),
            encoded
        );
    }

    /// Only a high surrogate directly followed by a low one is combined.
    #[test]
    fn from_utf16_pairs_adjacent_surrogates() {
        let paired = from_utf16(&[0xD83D, 0xDE00]);
        assert_eq!(paired, "😀".as_bytes());

        let mixed = from_utf16(&[0xD800, u16::from(b'x')]);
        assert_eq!(mixed, [&[0xED, 0xA0, 0x80][..], b"x"].concat());
        assert_eq!(utf16_len(&mixed), 2);

        let lone_low_first = from_utf16(&[0xDC00, 0x41]);
        assert_eq!(lone_low_first, [&[0xED, 0xB0, 0x80][..], b"A"].concat());
    }

    /// Encoding a single unit is the same as encoding that value as a code point.
    #[test]
    fn encode_utf16_unit_matches_code_point() {
        let mut surrogate = Vec::new();
        encode_utf16_unit(0xD800, &mut surrogate);
        assert_eq!(surrogate, enc(0xD800));

        let mut ascii = Vec::new();
        encode_utf16_unit(0x41, &mut ascii);
        assert_eq!(ascii, b"A");
    }

    /// Slice boundaries inside a surrogate pair widen to the whole character.
    #[test]
    fn slice_across_astral_char_is_boundary_correct() {
        let encoded = from_utf16(&[0x61, 0xD83D, 0xDE00, 0x62]);
        assert_eq!(encoded, "a😀b".as_bytes());
        assert_eq!(utf16_len(&encoded), 4);

        assert_eq!(slice_utf16(&encoded, 1, 3), "😀".as_bytes());
        assert_eq!(slice_utf16(&encoded, 0, 1), b"a");
        assert_eq!(slice_utf16(&encoded, 3, 4), b"b");
        assert_eq!(slice_utf16(&encoded, 1, 2), "😀".as_bytes());
        assert_eq!(slice_utf16(&encoded, 2, 3), "😀".as_bytes());
        assert_eq!(slice_utf16(&encoded, 3, 1), b"");
        assert_eq!(slice_utf16(&encoded, 10, 20), b"");
        assert_eq!(slice_utf16(&encoded, 0, 4), encoded);
    }

    /// Slicing keeps an unpaired surrogate intact.
    #[test]
    fn slice_preserves_lone_surrogates() {
        let encoded = from_utf16(&[0x78, 0xD800, 0x79]);
        assert_eq!(utf16_len(&encoded), 3);

        let middle = slice_utf16(&encoded, 1, 2);
        assert_eq!(middle, vec![0xED, 0xA0, 0x80]);
        assert_eq!(utf16_index(&middle, 0), Some(0xD800));
    }

    /// Lossy conversion swaps surrogates for U+FFFD and keeps valid text.
    #[test]
    fn to_string_lossy_replaces_surrogates() {
        let encoded = from_utf16(&[0x61, 0xD800, 0x62]);
        assert_eq!(to_string_lossy(&encoded), "a\u{FFFD}b");
        assert_eq!(to_string_lossy("héllo".as_bytes()), "héllo");
        assert_eq!(to_string_lossy("😀".as_bytes()), "😀");
    }

    /// Indexing past the last unit yields `None`.
    #[test]
    fn utf16_index_out_of_range() {
        let encoded = from_utf16(&[0x61, 0xD83D, 0xDE00]);
        assert_eq!(utf16_index(&encoded, 0), Some(0x61));
        assert_eq!(utf16_index(&encoded, 1), Some(0xD83D));
        assert_eq!(utf16_index(&encoded, 2), Some(0xDE00));
        assert_eq!(utf16_index(&encoded, 3), None);
    }

    /// Values above the last code point are encoded as U+FFFD.
    #[test]
    fn over_max_code_point_clamps() {
        assert_eq!(enc(0x11_0000), "\u{FFFD}".as_bytes());
    }

    /// The surrogate predicate is inclusive at both ends of the range.
    #[test]
    fn is_surrogate_predicate() {
        assert!(is_surrogate(0xD800));
        assert!(is_surrogate(0xDFFF));
        assert!(!is_surrogate(0xD7FF));
        assert!(!is_surrogate(0xE000));
        assert!(!is_surrogate(0x1_F600));
    }

    /// Every entry point copes with empty input.
    #[test]
    fn empty_slice() {
        assert_eq!(utf16_len(b""), 0);
        assert_eq!(utf16_units(b"").count(), 0);
        assert_eq!(code_points(b"").count(), 0);
        assert_eq!(slice_utf16(b"", 0, 0), b"");
        assert_eq!(as_str(b""), Some(""));
    }
}