use alloc::borrow::Cow;
use alloc::string::String;
use crate::ccc::CccBuffer;
use crate::compose;
use crate::decompose::{self, DecompForm};
use crate::hangul;
use crate::quick_check;
use crate::simd;
use crate::simd::prefetch;
use crate::tables;
use crate::utf8;
/// Unicode normalization form requested by the caller.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Form {
    /// Canonical decomposition, then composition.
    Nfc,
    /// Canonical decomposition only.
    Nfd,
    /// Compatibility decomposition, then composition.
    Nfkc,
    /// Compatibility decomposition only.
    Nfkd,
}

impl Form {
    /// Byte threshold used by the scanner: bytes at or above this value are
    /// treated as needing per-code-point work.
    #[inline]
    fn passthrough_bound(self) -> u8 {
        // All forms currently share one bound; the exhaustive pattern forces
        // a deliberate choice if a variant is ever added.
        match self {
            Form::Nfc | Form::Nfd | Form::Nfkc | Form::Nfkd => 0xC0,
        }
    }

    /// Returns `true` for the forms that recompose after decomposing.
    #[inline]
    fn composes(self) -> bool {
        match self {
            Form::Nfc | Form::Nfkc => true,
            Form::Nfd | Form::Nfkd => false,
        }
    }

    /// Selects which decomposition mapping (canonical or compatibility) the
    /// form is built on.
    #[inline]
    fn decomp_form(self) -> DecompForm {
        match self {
            Form::Nfkc | Form::Nfkd => DecompForm::Compatible,
            Form::Nfc | Form::Nfd => DecompForm::Canonical,
        }
    }

    /// Initial output capacity for an input of `input_len` bytes: the input
    /// length for composing forms, one and a half times that otherwise.
    #[inline]
    fn estimated_capacity(self, input_len: usize) -> usize {
        if self.composes() {
            input_len
        } else {
            input_len + input_len / 2
        }
    }

    /// Dispatches to the quick-check routine that matches this form.
    #[inline]
    fn quick_check(self, input: &str) -> quick_check::IsNormalized {
        match self {
            Form::Nfd => quick_check::quick_check_nfd(input),
            Form::Nfkd => quick_check::quick_check_nfkd(input),
            Form::Nfc => quick_check::quick_check_nfc(input),
            Form::Nfkc => quick_check::quick_check_nfkc(input),
        }
    }
}
/// Streaming state for one combining sequence: the starter that has not been
/// written yet, plus the combining marks collected after it.
struct NormState {
    /// Starter awaiting output, if any.
    current_starter: Option<char>,
    /// Combining marks gathered since the starter; sorted when flushed.
    ccc_buf: CccBuffer,
}

impl NormState {
    /// Creates a state with no pending starter and an empty mark buffer.
    #[inline]
    fn new() -> Self {
        Self {
            current_starter: None,
            ccc_buf: CccBuffer::new(),
        }
    }

    /// Sorts the buffered marks, appends them to `out`, and empties the
    /// buffer. Does nothing when no marks are buffered.
    #[inline]
    fn emit_sorted_marks(&mut self, out: &mut String) {
        if self.ccc_buf.is_empty() {
            return;
        }
        self.ccc_buf.sort_in_place();
        for mark in self.ccc_buf.as_slice() {
            out.push(mark.ch);
        }
        self.ccc_buf.clear();
    }

    /// Writes the pending starter and its marks to `out` and resets the
    /// state. With `composes` set, the starter and sorted marks are handed to
    /// `compose::compose_combining_sequence_into` instead of being copied.
    #[inline]
    fn flush(&mut self, out: &mut String, composes: bool) {
        let Some(starter) = self.current_starter.take() else {
            // Marks without a starter are still emitted in sorted order.
            self.emit_sorted_marks(out);
            return;
        };
        if self.ccc_buf.is_empty() {
            out.push(starter);
            return;
        }
        if composes {
            self.ccc_buf.sort_in_place();
            compose::compose_combining_sequence_into(starter, self.ccc_buf.as_slice(), out);
            self.ccc_buf.clear();
        } else {
            out.push(starter);
            self.emit_sorted_marks(out);
        }
    }

    /// Feeds one character with its combining class. A starter (`ccc == 0`)
    /// either merges into the pending starter (composing forms, no marks
    /// buffered, and `compose::compose` succeeds) or flushes the current
    /// sequence and becomes the new pending starter. Any other character is
    /// buffered as a mark.
    #[inline]
    fn feed_entry(&mut self, ch: char, ccc: u8, out: &mut String, composes: bool) {
        if ccc != 0 {
            self.ccc_buf.push(ch, ccc);
            return;
        }
        if composes && self.ccc_buf.is_empty() {
            let merged = self
                .current_starter
                .and_then(|prev| compose::compose(prev, ch));
            if let Some(composed) = merged {
                self.current_starter = Some(composed);
                return;
            }
        }
        self.flush(out, composes);
        self.current_starter = Some(ch);
    }

    /// Decomposition-only flush: writes the pending starter followed by its
    /// marks, using `take_single_inline` to skip sorting when the buffer
    /// yields a single mark that way.
    #[inline]
    fn flush_nfd(&mut self, out: &mut String) {
        let Some(starter) = self.current_starter.take() else {
            self.emit_sorted_marks(out);
            return;
        };
        out.push(starter);
        if let Some(only) = self.ccc_buf.take_single_inline() {
            out.push(only.ch);
            return;
        }
        self.emit_sorted_marks(out);
    }

    /// Decomposition-only feed: a starter flushes the current sequence and
    /// becomes pending; anything else is buffered as a mark.
    #[inline]
    fn feed_entry_nfd(&mut self, ch: char, ccc: u8, out: &mut String) {
        if ccc != 0 {
            self.ccc_buf.push(ch, ccc);
            return;
        }
        self.flush_nfd(out);
        self.current_starter = Some(ch);
    }
}
/// Returns `true` when `cp` lies in U+3400..=U+4DBF or U+4E00..=U+9FFF.
#[inline(always)]
fn is_cjk_unified(cp: u32) -> bool {
    matches!(cp, 0x3400..=0x4DBF | 0x4E00..=0x9FFF)
}
// Sentinel row for code points the table leaves unchanged. A zero base byte is
// what `latin1_supplement_nfd` checks to report "no entry".
const LATIN1_SELF_MAPPING: (u8, u16, u8) = (0, 0, 0);
// Decomposition table for U+00C0..=U+00FF, indexed by `code point - 0xC0`
// (equivalently, the low six bits of the second UTF-8 byte after 0xC3).
// Each row is (ASCII base letter, combining mark code point, ccc paired with the mark).
#[rustfmt::skip]
static LATIN1_NFD_TABLE: [(u8, u16, u8); 0x40] = [
// U+00C0..=U+00C3
(b'A', 0x0300, 230), (b'A', 0x0301, 230), (b'A', 0x0302, 230), (b'A', 0x0303, 230),
// U+00C4..=U+00C7 (U+00C6 maps to itself)
(b'A', 0x0308, 230), (b'A', 0x030A, 230), LATIN1_SELF_MAPPING, (b'C', 0x0327, 202),
// U+00C8..=U+00CB
(b'E', 0x0300, 230), (b'E', 0x0301, 230), (b'E', 0x0302, 230), (b'E', 0x0308, 230),
// U+00CC..=U+00CF
(b'I', 0x0300, 230), (b'I', 0x0301, 230), (b'I', 0x0302, 230), (b'I', 0x0308, 230),
// U+00D0..=U+00D3 (U+00D0 maps to itself)
LATIN1_SELF_MAPPING, (b'N', 0x0303, 230), (b'O', 0x0300, 230), (b'O', 0x0301, 230),
// U+00D4..=U+00D7 (U+00D7 maps to itself)
(b'O', 0x0302, 230), (b'O', 0x0303, 230), (b'O', 0x0308, 230), LATIN1_SELF_MAPPING,
// U+00D8..=U+00DB (U+00D8 maps to itself)
LATIN1_SELF_MAPPING, (b'U', 0x0300, 230), (b'U', 0x0301, 230), (b'U', 0x0302, 230),
// U+00DC..=U+00DF (U+00DE and U+00DF map to themselves)
(b'U', 0x0308, 230), (b'Y', 0x0301, 230), LATIN1_SELF_MAPPING, LATIN1_SELF_MAPPING,
// U+00E0..=U+00E3
(b'a', 0x0300, 230), (b'a', 0x0301, 230), (b'a', 0x0302, 230), (b'a', 0x0303, 230),
// U+00E4..=U+00E7 (U+00E6 maps to itself)
(b'a', 0x0308, 230), (b'a', 0x030A, 230), LATIN1_SELF_MAPPING, (b'c', 0x0327, 202),
// U+00E8..=U+00EB
(b'e', 0x0300, 230), (b'e', 0x0301, 230), (b'e', 0x0302, 230), (b'e', 0x0308, 230),
// U+00EC..=U+00EF
(b'i', 0x0300, 230), (b'i', 0x0301, 230), (b'i', 0x0302, 230), (b'i', 0x0308, 230),
// U+00F0..=U+00F3 (U+00F0 maps to itself)
LATIN1_SELF_MAPPING, (b'n', 0x0303, 230), (b'o', 0x0300, 230), (b'o', 0x0301, 230),
// U+00F4..=U+00F7 (U+00F7 maps to itself)
(b'o', 0x0302, 230), (b'o', 0x0303, 230), (b'o', 0x0308, 230), LATIN1_SELF_MAPPING,
// U+00F8..=U+00FB (U+00F8 maps to itself)
LATIN1_SELF_MAPPING, (b'u', 0x0300, 230), (b'u', 0x0301, 230), (b'u', 0x0302, 230),
// U+00FC..=U+00FF (U+00FE maps to itself)
(b'u', 0x0308, 230), (b'y', 0x0301, 230), LATIN1_SELF_MAPPING, (b'y', 0x0308, 230),
];
/// Looks up the `LATIN1_NFD_TABLE` row selected by the byte that follows
/// `byte_pos`, returning `(base letter, combining mark, mark ccc)` or `None`
/// when the row is the self-mapping sentinel.
///
/// # Safety
///
/// `bytes.add(byte_pos + 1)` must be valid for reads, and the table row that
/// byte selects must hold a valid `char` scalar value in its second field.
/// Callers in this file only invoke it when the byte at `byte_pos` is `0xC3`.
#[inline(always)]
unsafe fn latin1_supplement_nfd(bytes: *const u8, byte_pos: usize) -> Option<(u8, char, u8)> {
    let trail = unsafe { bytes.add(byte_pos + 1).read() };
    // Only the low six bits of the second byte select the table row.
    let (base, mark_cp, mark_ccc) = LATIN1_NFD_TABLE[usize::from(trail & 0x3F)];
    if base == 0 {
        return None;
    }
    let mark = unsafe { char::from_u32_unchecked(u32::from(mark_cp)) };
    Some((base, mark, mark_ccc))
}
/// Pushes one input character through the normalization state machine.
///
/// CJK unified ideographs become the pending starter directly, Hangul
/// syllables are split with `hangul::decompose_hangul`, and everything else is
/// looked up in the decomposition trie. `decomp_buf` is scratch space that is
/// cleared before each use.
#[inline]
fn process_char(
    ch: char,
    state: &mut NormState,
    out: &mut String,
    form: Form,
    decomp_buf: &mut CccBuffer,
) {
    let composes = form.composes();
    let cp = ch as u32;

    if cp >= 0x3400 && is_cjk_unified(cp) {
        state.flush(out, composes);
        state.current_starter = Some(ch);
        return;
    }

    if hangul::is_hangul_syllable(ch) {
        let (lead, vowel, trail) = hangul::decompose_hangul(ch);
        state.feed_entry(lead, 0, out, composes);
        state.feed_entry(vowel, 0, out, composes);
        if let Some(trail) = trail {
            state.feed_entry(trail, 0, out, composes);
        }
        return;
    }

    let decomp_form = form.decomp_form();
    let trie_value = tables::raw_decomp_trie_value(ch, decomp_form);
    if tables::has_decomposition(trie_value) {
        decomp_buf.clear();
        decompose::decompose_from_trie_value(ch, trie_value, decomp_buf, decomp_form);
        for piece in decomp_buf.as_slice() {
            state.feed_entry(piece.ch, piece.ccc, out, composes);
        }
    } else {
        // No mapping: the character passes through with its own ccc.
        state.feed_entry(ch, tables::ccc_from_trie_value(trie_value), out, composes);
    }
}
/// Classifies a decoded code point by whether its trie value carries a
/// decomposition and, if so, which mapping was consulted.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum DecompKind {
/// The trie value reports no decomposition.
None,
/// A decomposition found while using `DecompForm::Canonical`.
Canonical,
/// A decomposition found while using `DecompForm::Compatible`.
Compat,
}
/// Result of `decode_at`: one decoded code point plus the table data looked up for it.
struct DecodedCodepoint {
/// Decoded scalar value.
cp: u32,
/// Length of the UTF-8 sequence in bytes.
cp_len: u8,
/// Combining class extracted from `tv`.
ccc: u8,
/// Whether `tv` carries a decomposition, and of which kind.
decomp_kind: DecompKind,
/// Packed expansion entries for the decomposition; empty when there is none
/// or when the tables returned no expansion data for `tv`.
decomp: &'static [u32],
/// Raw decomposition trie value for `cp`.
tv: u32,
}
/// Decodes the UTF-8 sequence starting at `bytes[idx]` and looks up its
/// decomposition trie value, combining class and expansion data for `form`.
///
/// # Safety
///
/// `bytes` must point to at least `len` readable bytes of valid UTF-8, and
/// `idx` must be the offset of a lead byte whose whole sequence lies inside
/// `idx..len`. These conditions are only checked by debug assertions.
#[inline(always)]
unsafe fn decode_at(bytes: *const u8, idx: usize, len: usize, form: Form) -> DecodedCodepoint {
    debug_assert!(idx < len);
    let lead = unsafe { *bytes.add(idx) };
    let cp_len = utf8::utf8_char_width(lead);
    debug_assert!(cp_len > 0, "decode_at called on continuation/invalid byte");
    debug_assert!(idx + cp_len <= len, "UTF-8 sequence runs past end of input");

    // Six payload bits of the continuation byte `offset` positions after the lead.
    let trail = |offset: usize| -> u32 { u32::from(unsafe { *bytes.add(idx + offset) }) & 0x3F };
    let lead_bits = u32::from(lead);
    let cp = match cp_len {
        1 => lead_bits,
        2 => ((lead_bits & 0x1F) << 6) | trail(1),
        3 => ((lead_bits & 0x0F) << 12) | (trail(1) << 6) | trail(2),
        4 => ((lead_bits & 0x07) << 18) | (trail(1) << 12) | (trail(2) << 6) | trail(3),
        _ => unsafe { core::hint::unreachable_unchecked() },
    };

    let decomp_form = form.decomp_form();
    let tv = if cp < 0x10000 {
        let ch = unsafe { char::from_u32_unchecked(cp) };
        tables::raw_decomp_trie_value(ch, decomp_form)
    } else {
        unsafe { tables::raw_decomp_trie_value_supplementary(cp, decomp_form) }
    };
    let ccc = tables::ccc_from_trie_value(tv);

    let (decomp_kind, decomp) = if tables::has_decomposition(tv) {
        let kind = match decomp_form {
            DecompForm::Compatible => DecompKind::Compat,
            DecompForm::Canonical => DecompKind::Canonical,
        };
        let expansion = tables::expansion_data_from_trie_value(tv, decomp_form).unwrap_or(&[]);
        (kind, expansion)
    } else {
        (DecompKind::None, &[][..])
    };

    DecodedCodepoint {
        cp,
        cp_len: cp_len as u8,
        ccc,
        decomp_kind,
        decomp,
        tv,
    }
}
/// Feeds every entry of a packed expansion into `state`.
///
/// Each entry holds a code point (under `tables::EXPANSION_CP_MASK`) and its
/// combining class (above `tables::EXPANSION_CCC_SHIFT`). A two-entry
/// expansion that begins with a starter takes a shortcut in the
/// decomposition-only forms.
#[inline(always)]
fn feed_expansion(decomp: &'static [u32], state: &mut NormState, out: &mut String, composes: bool) {
    // Splits a packed entry into its character and combining class.
    let unpack = |word: u32| -> (char, u8) {
        let cp = word & tables::EXPANSION_CP_MASK;
        let ccc = (word >> tables::EXPANSION_CCC_SHIFT) as u8;
        debug_assert!(cp <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp));
        (unsafe { char::from_u32_unchecked(cp) }, ccc)
    };

    if !composes {
        if let &[first, second] = decomp {
            if (first >> tables::EXPANSION_CCC_SHIFT) as u8 == 0 {
                // Starter first: flush what is pending and install it directly.
                state.flush_nfd(out);
                let (starter, _) = unpack(first);
                state.current_starter = Some(starter);
                let (follower, follower_ccc) = unpack(second);
                if follower_ccc == 0 {
                    state.feed_entry_nfd(follower, 0, out);
                } else {
                    state.ccc_buf.push(follower, follower_ccc);
                }
                return;
            }
        }
    }

    for &word in decomp {
        let (ch, ccc) = unpack(word);
        if composes {
            state.feed_entry(ch, ccc, out, true);
        } else {
            state.feed_entry_nfd(ch, ccc, out);
        }
    }
}
/// Feeds a decomposition whose single target is stored in the low 16 bits of
/// the trie value rather than in the expansion table.
#[cold]
#[inline(never)]
fn feed_singleton(tv: u32, state: &mut NormState, out: &mut String, composes: bool) {
    let target = tv & 0xFFFF;
    debug_assert!(target <= 0xD7FF || (0xE000..=0xFFFF).contains(&target));
    let mapped = unsafe { char::from_u32_unchecked(target) };
    // Targets up to 0x7F are given class 0 without a table lookup.
    let ccc = match target {
        0..=0x7F => 0,
        _ => tables::lookup_ccc(mapped),
    };
    match composes {
        true => state.feed_entry(mapped, ccc, out, true),
        false => state.feed_entry_nfd(mapped, ccc, out),
    }
}
/// Routes `ch` with class `ccc` to the composing or the decomposition-only
/// feed, depending on `composes`.
#[inline]
fn feed_combining_mark(ch: char, ccc: u8, state: &mut NormState, out: &mut String, composes: bool) {
    match composes {
        true => state.feed_entry(ch, ccc, out, true),
        false => state.feed_entry_nfd(ch, ccc, out),
    }
}
/// Feeds an already decoded code point into `state`.
///
/// Code points without a decomposition are fed as themselves. Otherwise the
/// expansion slice is fed when present, and the singleton path is used when
/// the slice is empty.
#[inline(always)]
fn process_codepoint(dc: &DecodedCodepoint, state: &mut NormState, out: &mut String, form: Form) {
    let composes = form.composes();
    match (dc.decomp_kind, dc.decomp.is_empty()) {
        (DecompKind::None, _) => {
            let ch = unsafe { char::from_u32_unchecked(dc.cp) };
            match (dc.ccc, composes) {
                (0, true) => state.feed_entry(ch, 0, out, true),
                (0, false) => state.feed_entry_nfd(ch, 0, out),
                (ccc, _) => feed_combining_mark(ch, ccc, state, out, composes),
            }
        },
        (DecompKind::Canonical | DecompKind::Compat, false) => {
            feed_expansion(dc.decomp, state, out, composes);
        },
        (DecompKind::Canonical | DecompKind::Compat, true) => {
            feed_singleton(dc.tv, state, out, composes);
        },
    }
}
/// Copies a passthrough run to `out` ahead of a code point with trie value
/// `next_tv`.
///
/// When `tables::needs_starter_shadow(next_tv)` holds, the final byte of
/// `pass` is not copied but fed to `state` as a starter, so it stays pending
/// for the code point that follows. `pass` must be non-empty in that case, and
/// its final byte is reinterpreted as a `char` on its own, so it is expected
/// to be ASCII.
#[inline(always)]
fn flush_compose_passthrough(pass: &str, next_tv: u32, state: &mut NormState, out: &mut String) {
    if !tables::needs_starter_shadow(next_tv) {
        out.push_str(pass);
        return;
    }
    let n = pass.len();
    if n > 1 {
        out.push_str(&pass[..n - 1]);
    }
    let held_back = pass.as_bytes()[n - 1];
    state.feed_entry(held_back as char, 0, out, true);
}
/// Character-at-a-time normalization, used for short inputs.
///
/// Returns `Cow::Borrowed(input)` when the input is empty, when the quick
/// check answers `Yes`, or when the normalized text equals the input;
/// otherwise returns the newly built string.
fn normalize_scalar<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
    if input.is_empty() || form.quick_check(input) == quick_check::IsNormalized::Yes {
        return Cow::Borrowed(input);
    }

    let composes = form.composes();
    let mut state = NormState::new();
    let mut scratch = CccBuffer::new();
    let mut out = String::with_capacity(input.len());
    input
        .chars()
        .for_each(|ch| process_char(ch, &mut state, &mut out, form, &mut scratch));
    state.flush(&mut out, composes);

    // The quick check may have been inconclusive; avoid handing back a copy
    // when nothing actually changed.
    if out == input {
        return Cow::Borrowed(input);
    }
    Cow::Owned(out)
}
/// Normalizes `input` to `form`, borrowing the input when it is already normalized.
///
/// Inputs shorter than 64 bytes go to `normalize_scalar`. Longer inputs are
/// scanned in 64-byte chunks: the scanner returns a bitmask of bytes that need
/// attention, flagged code points are run through the `NormState` machine, and
/// the bytes between them are copied to the output verbatim.
#[inline]
fn normalize_impl<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
let bytes = input.as_bytes();
let len = bytes.len();
if len < 64 {
return normalize_scalar(input, form);
}
let qc = form.quick_check(input);
if qc == quick_check::IsNormalized::Yes {
return Cow::Borrowed(input);
}
let bound = form.passthrough_bound();
let composes = form.composes();
let mut out = String::with_capacity(form.estimated_capacity(len));
// Offset of the first input byte that has neither been copied to `out` nor
// consumed by the state machine.
let mut last_written: usize = 0;
let mut state = NormState::new();
let mut pos: usize = 0;
let ptr = bytes.as_ptr();
// Issues a write prefetch a fixed distance past the current end of `out`,
// but only while that address is still within the allocated capacity.
macro_rules! prefetch_write_head {
($out:expr) => {
unsafe {
let write_head = $out.len();
let distance = prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE;
if write_head + distance <= $out.capacity() {
prefetch::prefetch_write($out.as_ptr().wrapping_add(write_head + distance));
}
}
};
}
// Handles one 64-byte chunk: each set bit of the mask is the offset of a
// flagged byte within the chunk. Bits are visited from lowest to highest.
macro_rules! process_chunk {
($chunk_start:expr, $mask:expr) => {{
let chunk_start: usize = $chunk_start;
let mask: u64 = $mask;
if mask != 0 {
let mut chunk_mask = mask;
while chunk_mask != 0 {
let bit_pos = chunk_mask.trailing_zeros() as usize;
// Clear the lowest set bit.
chunk_mask &= chunk_mask.wrapping_sub(1);
let byte_pos = chunk_start + bit_pos;
// Already consumed as part of an earlier code point.
if byte_pos < last_written {
continue;
}
if utf8::is_continuation_byte(bytes[byte_pos]) {
continue;
}
// Decomposition-only shortcut for the 0xC3 lead byte (U+00C0..=U+00FF):
// a table hit yields the base letter and its mark without a trie lookup.
if !composes && bytes[byte_pos] == 0xC3 {
if let Some((starter, mark, mark_ccc)) =
unsafe { latin1_supplement_nfd(ptr, byte_pos) }
{
if byte_pos > last_written {
// Pending state belongs before the passthrough run, so flush it first.
state.flush_nfd(&mut out);
out.push_str(&input[last_written..byte_pos]);
}
last_written = byte_pos + 2;
state.flush_nfd(&mut out);
// The base letter is written immediately; the mark stays buffered so
// that following marks are ordered together with it at flush time.
out.push(starter as char);
state.ccc_buf.push(mark, mark_ccc);
continue;
}
}
let dc = unsafe { decode_at(ptr, byte_pos, len, form) };
let width = dc.cp_len as usize;
if !composes {
// Hangul syllables are decomposed straight into `out`.
if (hangul::S_BASE..hangul::S_BASE + hangul::S_COUNT).contains(&dc.cp) {
if byte_pos > last_written {
state.flush_nfd(&mut out);
out.push_str(&input[last_written..byte_pos]);
}
last_written = byte_pos + width;
state.flush_nfd(&mut out);
let ch = unsafe { char::from_u32_unchecked(dc.cp) };
hangul::push_decomposed_hangul(ch, &mut out);
continue;
}
// A starter without a decomposition stays in the passthrough run;
// `last_written` is deliberately left untouched.
if dc.decomp_kind == DecompKind::None && dc.ccc == 0 {
continue;
}
if byte_pos > last_written {
state.flush_nfd(&mut out);
out.push_str(&input[last_written..byte_pos]);
}
last_written = byte_pos + width;
process_codepoint(&dc, &mut state, &mut out, form);
continue;
}
// Composing forms: every flagged code point goes through the state machine.
if byte_pos > last_written {
state.flush(&mut out, composes);
let pass = &input[last_written..byte_pos];
// May hold back the last passthrough byte as a pending starter.
flush_compose_passthrough(pass, dc.tv, &mut state, &mut out);
}
last_written = byte_pos + width;
process_codepoint(&dc, &mut state, &mut out, form);
}
}
}};
}
// Main loop: 128 bytes per iteration, scanned as two 64-byte chunks.
while pos + 128 <= len {
let chunk_a_start = pos;
let chunk_b_start = pos + 64;
let (mask_a, mask_b) = unsafe {
// `wrapping_add` is used because the prefetch targets may lie past the
// end of the input.
let prefetch_l1 =
ptr.wrapping_add(pos + prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE);
let prefetch_l2 =
ptr.wrapping_add(pos + prefetch::PREFETCH_L2_DISTANCE * prefetch::CHUNK_SIZE);
simd::scan_pair_and_prefetch(
ptr.add(chunk_a_start),
ptr.add(chunk_b_start),
prefetch_l1,
prefetch_l2,
bound,
)
};
prefetch_write_head!(out);
process_chunk!(chunk_a_start, mask_a);
process_chunk!(chunk_b_start, mask_b);
pos += 128;
}
// At most one remaining full 64-byte chunk.
while pos + 64 <= len {
let chunk_start = pos;
let mask = unsafe {
let prefetch_l1 =
ptr.wrapping_add(pos + prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE);
let prefetch_l2 =
ptr.wrapping_add(pos + prefetch::PREFETCH_L2_DISTANCE * prefetch::CHUNK_SIZE);
simd::scan_and_prefetch(ptr.add(pos), prefetch_l1, prefetch_l2, bound)
};
prefetch_write_head!(out);
process_chunk!(chunk_start, mask);
pos += 64;
}
// Tail of fewer than 64 bytes: walked byte by byte, and only when it contains
// a byte at or above `bound`. The per-code-point handling mirrors
// `process_chunk!`, except that every lead byte is visited, not just flagged ones.
if pos < len {
let tail_has_work = bytes[pos..].iter().any(|&b| b >= bound);
if tail_has_work {
let mut tail_pos = pos;
while tail_pos < len {
if tail_pos < last_written {
tail_pos += 1;
continue;
}
if utf8::is_continuation_byte(bytes[tail_pos]) {
tail_pos += 1;
continue;
}
if !composes && bytes[tail_pos] == 0xC3 {
if let Some((starter, mark, mark_ccc)) =
unsafe { latin1_supplement_nfd(ptr, tail_pos) }
{
if tail_pos > last_written {
state.flush_nfd(&mut out);
out.push_str(&input[last_written..tail_pos]);
}
last_written = tail_pos + 2;
state.flush_nfd(&mut out);
out.push(starter as char);
state.ccc_buf.push(mark, mark_ccc);
tail_pos += 2;
continue;
}
}
let dc = unsafe { decode_at(ptr, tail_pos, len, form) };
let width = dc.cp_len as usize;
if !composes {
if (hangul::S_BASE..hangul::S_BASE + hangul::S_COUNT).contains(&dc.cp) {
if tail_pos > last_written {
state.flush_nfd(&mut out);
out.push_str(&input[last_written..tail_pos]);
}
last_written = tail_pos + width;
state.flush_nfd(&mut out);
let ch = unsafe { char::from_u32_unchecked(dc.cp) };
hangul::push_decomposed_hangul(ch, &mut out);
tail_pos += width;
continue;
}
if dc.decomp_kind == DecompKind::None && dc.ccc == 0 {
tail_pos += width;
continue;
}
if tail_pos > last_written {
state.flush_nfd(&mut out);
out.push_str(&input[last_written..tail_pos]);
}
last_written = tail_pos + width;
process_codepoint(&dc, &mut state, &mut out, form);
tail_pos += width;
continue;
}
if tail_pos > last_written {
state.flush(&mut out, composes);
let pass = &input[last_written..tail_pos];
flush_compose_passthrough(pass, dc.tv, &mut state, &mut out);
}
last_written = tail_pos + width;
process_codepoint(&dc, &mut state, &mut out, form);
tail_pos += width;
}
}
}
// Emit whatever the state machine still holds, then the trailing passthrough run.
if composes {
state.flush(&mut out, true);
} else {
state.flush_nfd(&mut out);
}
if last_written < len {
out.push_str(&input[last_written..len]);
}
// An inconclusive quick check may still mean the input was already
// normalized; in that case hand the input back instead of the copy.
if qc == quick_check::IsNormalized::Maybe && out == input {
Cow::Borrowed(input)
} else {
Cow::Owned(out)
}
}
/// Stateless normalizer whose methods use `Form::Nfc`.
pub struct NfcNormalizer;
/// Stateless normalizer whose methods use `Form::Nfd`.
pub struct NfdNormalizer;
/// Stateless normalizer whose methods use `Form::Nfkc`.
pub struct NfkcNormalizer;
/// Stateless normalizer whose methods use `Form::Nfkd`.
pub struct NfkdNormalizer;
impl Default for NfcNormalizer {
fn default() -> Self {
Self::new()
}
}
impl Default for NfdNormalizer {
fn default() -> Self {
Self::new()
}
}
impl Default for NfkcNormalizer {
fn default() -> Self {
Self::new()
}
}
impl Default for NfkdNormalizer {
fn default() -> Self {
Self::new()
}
}
impl NfcNormalizer {
pub fn new() -> Self {
NfcNormalizer
}
pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
quick_check::quick_check_nfc(input)
}
pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
normalize_impl(input, Form::Nfc)
}
pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
let result = normalize_impl(input, Form::Nfc);
let already_normalized = matches!(&result, Cow::Borrowed(_));
out.push_str(&result);
already_normalized
}
pub fn is_normalized(&self, input: &str) -> bool {
quick_check::is_normalized_nfc(input)
}
}
impl NfdNormalizer {
pub fn new() -> Self {
NfdNormalizer
}
pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
quick_check::quick_check_nfd(input)
}
pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
normalize_impl(input, Form::Nfd)
}
pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
let result = normalize_impl(input, Form::Nfd);
let already_normalized = matches!(&result, Cow::Borrowed(_));
out.push_str(&result);
already_normalized
}
pub fn is_normalized(&self, input: &str) -> bool {
quick_check::is_normalized_nfd(input)
}
}
impl NfkcNormalizer {
pub fn new() -> Self {
NfkcNormalizer
}
pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
quick_check::quick_check_nfkc(input)
}
pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
normalize_impl(input, Form::Nfkc)
}
pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
let result = normalize_impl(input, Form::Nfkc);
let already_normalized = matches!(&result, Cow::Borrowed(_));
out.push_str(&result);
already_normalized
}
pub fn is_normalized(&self, input: &str) -> bool {
quick_check::is_normalized_nfkc(input)
}
}
impl NfkdNormalizer {
pub fn new() -> Self {
NfkdNormalizer
}
pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
quick_check::quick_check_nfkd(input)
}
pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
normalize_impl(input, Form::Nfkd)
}
pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
let result = normalize_impl(input, Form::Nfkd);
let already_normalized = matches!(&result, Cow::Borrowed(_));
out.push_str(&result);
already_normalized
}
pub fn is_normalized(&self, input: &str) -> bool {
quick_check::is_normalized_nfkd(input)
}
}
// Unit tests for the private helpers in this file: the Latin-1 fast-path
// table, the `Form` helpers, the `NormState` machine, `normalize_impl`'s
// borrow/own behaviour, and the CJK range predicate.
#[cfg(test)]
mod tests {
use super::*;
use alloc::borrow::Cow;
use alloc::string::String;
use alloc::vec::Vec;
// Each input here is a single two-byte character, so `normalize_impl` takes the
// short-input path; the table is therefore checked against the general lookup.
#[test]
fn latin1_table_matches_runtime_lookup_nfd() {
for cp in 0xC0u32..=0xFF {
let ch = char::from_u32(cp).unwrap();
let mut buf = String::new();
buf.push(ch);
let general: Cow<'_, str> = normalize_impl(&buf, Form::Nfd);
let entry = LATIN1_NFD_TABLE[(cp - 0xC0) as usize];
let mut fast = String::new();
// A zero base byte marks a self-mapping row.
if entry.0 == 0 {
fast.push(ch);
} else {
fast.push(entry.0 as char);
fast.push(char::from_u32(entry.1 as u32).unwrap());
}
assert_eq!(
&*general, fast,
"NFD mismatch for U+{:04X}: trie={:?} table={:?}",
cp, &*general, fast
);
}
}
// The fast path is used for both decomposition-only forms, so NFD and NFKD
// must agree on every code point the table covers.
#[test]
fn latin1_table_matches_runtime_lookup_nfkd() {
for cp in 0xC0u32..=0xFF {
let ch = char::from_u32(cp).unwrap();
let mut buf = String::new();
buf.push(ch);
let nfd: Cow<'_, str> = normalize_impl(&buf, Form::Nfd);
let nfkd: Cow<'_, str> = normalize_impl(&buf, Form::Nfkd);
assert_eq!(
&*nfd, &*nfkd,
"NFD/NFKD diverge for U+{:04X}: nfd={:?} nfkd={:?}",
cp, &*nfd, &*nfkd
);
}
}
// --- `Form` helper methods ---
#[test]
fn passthrough_bound_all_forms_return_0xc0() {
assert_eq!(Form::Nfc.passthrough_bound(), 0xC0);
assert_eq!(Form::Nfd.passthrough_bound(), 0xC0);
assert_eq!(Form::Nfkc.passthrough_bound(), 0xC0);
assert_eq!(Form::Nfkd.passthrough_bound(), 0xC0);
}
#[test]
fn composes_nfc_nfkc_true_nfd_nfkd_false() {
assert!(Form::Nfc.composes());
assert!(Form::Nfkc.composes());
assert!(!Form::Nfd.composes());
assert!(!Form::Nfkd.composes());
}
#[test]
fn decomp_form_canonical_vs_compatible() {
assert_eq!(Form::Nfc.decomp_form(), DecompForm::Canonical);
assert_eq!(Form::Nfd.decomp_form(), DecompForm::Canonical);
assert_eq!(Form::Nfkc.decomp_form(), DecompForm::Compatible);
assert_eq!(Form::Nfkd.decomp_form(), DecompForm::Compatible);
}
#[test]
fn estimated_capacity_nfc_nfkc_same_nfd_nfkd_larger() {
let input_len = 100;
assert_eq!(Form::Nfc.estimated_capacity(input_len), 100);
assert_eq!(Form::Nfkc.estimated_capacity(input_len), 100);
assert_eq!(Form::Nfd.estimated_capacity(input_len), 150);
assert_eq!(Form::Nfkd.estimated_capacity(input_len), 150);
}
#[test]
fn estimated_capacity_zero_length() {
assert_eq!(Form::Nfc.estimated_capacity(0), 0);
assert_eq!(Form::Nfd.estimated_capacity(0), 0);
}
#[test]
fn quick_check_ascii_is_yes_for_all_forms() {
let ascii = "Hello, World!";
assert_eq!(Form::Nfc.quick_check(ascii), quick_check::IsNormalized::Yes);
assert_eq!(Form::Nfd.quick_check(ascii), quick_check::IsNormalized::Yes);
assert_eq!(
Form::Nfkc.quick_check(ascii),
quick_check::IsNormalized::Yes
);
assert_eq!(
Form::Nfkd.quick_check(ascii),
quick_check::IsNormalized::Yes
);
}
// --- `NormState::feed_entry` / `feed_entry_nfd` ---
#[test]
fn normstate_new_has_no_starter_empty_ccc_buf() {
let state = NormState::new();
assert!(state.current_starter.is_none());
assert!(state.ccc_buf.is_empty());
}
// A starter is held back rather than written, so `out` stays empty.
#[test]
fn feed_entry_single_starter_sets_current_starter() {
let mut state = NormState::new();
let mut out = String::new();
state.feed_entry('A', 0, &mut out, false);
assert_eq!(state.current_starter, Some('A'));
assert!(state.ccc_buf.is_empty());
assert!(out.is_empty()); }
#[test]
fn feed_entry_combining_mark_buffers_in_ccc_buf() {
let mut state = NormState::new();
let mut out = String::new();
state.feed_entry('e', 0, &mut out, false);
state.feed_entry('\u{0301}', 230, &mut out, false);
assert_eq!(state.current_starter, Some('e'));
assert!(!state.ccc_buf.is_empty());
assert_eq!(state.ccc_buf.len(), 1);
assert_eq!(state.ccc_buf.as_slice()[0].ch, '\u{0301}');
assert_eq!(state.ccc_buf.as_slice()[0].ccc, 230);
}
// The first starter is only written once the second one arrives.
#[test]
fn feed_entry_two_starters_first_gets_flushed() {
let mut state = NormState::new();
let mut out = String::new();
state.feed_entry('A', 0, &mut out, false);
assert!(out.is_empty());
state.feed_entry('B', 0, &mut out, false);
assert_eq!(out, "A");
assert_eq!(state.current_starter, Some('B'));
}
// Two starters that compose (U+1100 + U+1161) merge into the pending starter.
#[test]
fn feed_entry_starter_to_starter_composition_hangul_lv() {
let mut state = NormState::new();
let mut out = String::new();
state.feed_entry('\u{1100}', 0, &mut out, true);
state.feed_entry('\u{1161}', 0, &mut out, true);
assert_eq!(state.current_starter, Some('\u{AC00}'));
assert!(out.is_empty());
}
// NOTE(review): despite the name, U+0301 is fed with ccc 230, so this exercises
// starter + combining mark composition at flush time, not starter-to-starter.
#[test]
fn feed_entry_starter_to_starter_composition_e_acute() {
let mut state = NormState::new();
let mut out = String::new();
state.feed_entry('e', 0, &mut out, true);
state.feed_entry('\u{0301}', 230, &mut out, true);
state.flush(&mut out, true);
assert_eq!(out, "\u{00E9}"); }
#[test]
fn feed_entry_nfd_starters_and_combining_marks() {
let mut state = NormState::new();
let mut out = String::new();
state.feed_entry_nfd('A', 0, &mut out);
assert_eq!(state.current_starter, Some('A'));
state.feed_entry_nfd('\u{0300}', 230, &mut out);
assert_eq!(state.ccc_buf.len(), 1);
state.feed_entry_nfd('B', 0, &mut out);
assert_eq!(out, "A\u{0300}");
assert_eq!(state.current_starter, Some('B'));
}
// --- `NormState::flush` ---
#[test]
fn flush_no_starter_no_marks_nothing_emitted() {
let mut state = NormState::new();
let mut out = String::new();
state.flush(&mut out, false);
assert!(out.is_empty());
state.flush(&mut out, true);
assert!(out.is_empty());
}
#[test]
fn flush_starter_only_emits_starter() {
let mut state = NormState::new();
let mut out = String::new();
state.current_starter = Some('X');
state.flush(&mut out, false);
assert_eq!(out, "X");
}
#[test]
fn flush_starter_one_combining_mark_no_compose() {
let mut state = NormState::new();
let mut out = String::new();
state.current_starter = Some('e');
state.ccc_buf.push('\u{0301}', 230); state.flush(&mut out, false);
assert_eq!(out, "e\u{0301}");
}
#[test]
fn flush_starter_one_combining_mark_with_compose() {
let mut state = NormState::new();
let mut out = String::new();
state.current_starter = Some('e');
state.ccc_buf.push('\u{0301}', 230); state.flush(&mut out, true);
assert_eq!(out, "\u{00E9}"); }
// Marks pushed as 230, 220, 202 must come out as 202, 220, 230.
#[test]
fn flush_starter_multiple_ccc_disordered_marks_emits_sorted() {
let mut state = NormState::new();
let mut out = String::new();
state.current_starter = Some('a');
state.ccc_buf.push('\u{0301}', 230); state.ccc_buf.push('\u{0323}', 220); state.ccc_buf.push('\u{0327}', 202); state.flush(&mut out, false);
let chars: Vec<char> = out.chars().collect();
assert_eq!(chars[0], 'a');
assert_eq!(chars[1], '\u{0327}'); assert_eq!(chars[2], '\u{0323}'); assert_eq!(chars[3], '\u{0301}'); }
// Marks without a starter are still sorted and emitted.
#[test]
fn flush_orphan_combining_marks_no_starter_emits_sorted() {
let mut state = NormState::new();
let mut out = String::new();
state.ccc_buf.push('\u{0301}', 230); state.ccc_buf.push('\u{0327}', 202); state.flush(&mut out, false);
let chars: Vec<char> = out.chars().collect();
assert_eq!(chars.len(), 2);
assert_eq!(chars[0], '\u{0327}'); assert_eq!(chars[1], '\u{0301}'); }
// --- `NormState::flush_nfd` ---
#[test]
fn flush_nfd_no_starter_no_marks_nothing_emitted() {
let mut state = NormState::new();
let mut out = String::new();
state.flush_nfd(&mut out);
assert!(out.is_empty());
}
#[test]
fn flush_nfd_starter_only_emits_starter() {
let mut state = NormState::new();
let mut out = String::new();
state.current_starter = Some('Z');
state.flush_nfd(&mut out);
assert_eq!(out, "Z");
}
// After the flush the buffer must be empty as well as the output correct.
#[test]
fn flush_nfd_single_mark_fast_path_take_single_inline() {
let mut state = NormState::new();
let mut out = String::new();
state.current_starter = Some('e');
state.ccc_buf.push('\u{0301}', 230); state.flush_nfd(&mut out);
assert_eq!(out, "e\u{0301}");
assert!(state.ccc_buf.is_empty());
}
#[test]
fn flush_nfd_multiple_marks_sorted() {
let mut state = NormState::new();
let mut out = String::new();
state.current_starter = Some('o');
state.ccc_buf.push('\u{0301}', 230); state.ccc_buf.push('\u{0327}', 202); state.flush_nfd(&mut out);
let chars: Vec<char> = out.chars().collect();
assert_eq!(chars[0], 'o');
assert_eq!(chars[1], '\u{0327}'); assert_eq!(chars[2], '\u{0301}'); }
#[test]
fn flush_nfd_orphan_combining_marks_no_starter() {
let mut state = NormState::new();
let mut out = String::new();
state.ccc_buf.push('\u{0301}', 230);
state.ccc_buf.push('\u{0323}', 220);
state.flush_nfd(&mut out);
let chars: Vec<char> = out.chars().collect();
assert_eq!(chars.len(), 2);
assert_eq!(chars[0], '\u{0323}'); assert_eq!(chars[1], '\u{0301}'); }
// --- `normalize_impl`: borrowed versus owned results ---
// Short input: this goes through `normalize_scalar`.
#[test]
fn normalize_impl_nfc_already_normalized_returns_borrowed() {
let input = "\u{00C5}\u{0300}";
let result = normalize_impl(input, Form::Nfc);
assert!(
matches!(result, Cow::Borrowed(_)),
"Expected Cow::Borrowed for already-NFC input with Maybe QC, got Cow::Owned({:?})",
result
);
assert_eq!(&*result, input);
}
// 60 ASCII bytes plus two two-byte characters is exactly 64 bytes, the
// smallest input that takes the chunked path.
#[test]
fn normalize_impl_nfc_maybe_borrowed_simd_path() {
let mut input = String::new();
input.push_str(&"a".repeat(60));
input.push_str("\u{00C5}\u{0300}"); assert!(input.len() >= 64, "input must be >= 64 bytes for SIMD path");
let result = normalize_impl(&input, Form::Nfc);
assert!(
matches!(result, Cow::Borrowed(_)),
"Expected Cow::Borrowed for >=64 byte already-NFC input with Maybe QC, got Cow::Owned({:?})",
result
);
assert_eq!(&*result, &*input);
}
#[test]
fn normalize_impl_ascii_returns_borrowed() {
let input = "Hello, world!";
let result = normalize_impl(input, Form::Nfc);
assert!(matches!(result, Cow::Borrowed(_)));
assert_eq!(&*result, input);
}
#[test]
fn normalize_impl_nfd_already_decomposed_returns_borrowed() {
let input = "e\u{0301}";
let result = normalize_impl(input, Form::Nfd);
assert!(
matches!(result, Cow::Borrowed(_)),
"Expected Cow::Borrowed for already-NFD input"
);
}
#[test]
fn normalize_impl_nfc_not_normalized_returns_owned() {
let input = "e\u{0301}";
let result = normalize_impl(input, Form::Nfc);
assert!(matches!(result, Cow::Owned(_)));
assert_eq!(&*result, "\u{00E9}");
}
// --- `is_cjk_unified`: both ends of each range and the values just outside ---
#[test]
fn cjk_unified_extension_a_start() {
assert!(is_cjk_unified(0x3400));
}
#[test]
fn cjk_unified_extension_a_end() {
assert!(is_cjk_unified(0x4DBF));
}
#[test]
fn cjk_unified_main_start() {
assert!(is_cjk_unified(0x4E00));
}
#[test]
fn cjk_unified_main_end() {
assert!(is_cjk_unified(0x9FFF));
}
#[test]
fn cjk_unified_just_before_extension_a() {
assert!(!is_cjk_unified(0x33FF));
}
#[test]
fn cjk_unified_gap_between_extension_a_and_main() {
assert!(!is_cjk_unified(0x4DC0));
}
#[test]
fn cjk_unified_just_after_main() {
assert!(!is_cjk_unified(0xA000));
}
}