use alloc::borrow::Cow;
use alloc::string::String;
use crate::ccc::CccBuffer;
use crate::compose;
use crate::decompose::{self, DecompForm};
use crate::hangul;
use crate::quick_check;
use crate::simd;
use crate::simd::prefetch;
use crate::tables;
use crate::utf8;
/// Unicode normalization form selected for a normalization pass.
///
/// The helpers on this type decide, per form, whether composition runs after
/// decomposition and whether canonical or compatibility mappings are used.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Form {
/// Canonical decomposition followed by composition.
Nfc,
/// Canonical decomposition only.
Nfd,
/// Compatibility decomposition followed by composition.
Nfkc,
/// Compatibility decomposition only.
Nfkd,
}
impl Form {
    /// Byte threshold handed to the scanner: bytes at or above this value are
    /// treated as needing per-character work, everything below is passed through.
    ///
    /// All four forms currently share `0xC0`; the match stays exhaustive so a new
    /// variant forces a decision here.
    #[inline]
    fn passthrough_bound(self) -> u8 {
        match self {
            Form::Nfc | Form::Nfkc | Form::Nfd | Form::Nfkd => 0xC0,
        }
    }

    /// Returns `true` for the forms that recompose after decomposing.
    #[inline]
    fn composes(self) -> bool {
        match self {
            Form::Nfc | Form::Nfkc => true,
            Form::Nfd | Form::Nfkd => false,
        }
    }

    /// Selects canonical mappings for NFC/NFD and compatibility mappings for NFKC/NFKD.
    #[inline]
    fn decomp_form(self) -> DecompForm {
        if matches!(self, Form::Nfkc | Form::Nfkd) {
            DecompForm::Compatible
        } else {
            DecompForm::Canonical
        }
    }

    /// Initial output capacity guess for an input of `input_len` bytes.
    ///
    /// Composing forms reserve exactly the input length; decomposing forms
    /// reserve one and a half times the input length.
    #[inline]
    fn estimated_capacity(self, input_len: usize) -> usize {
        if self.composes() {
            input_len
        } else {
            input_len + input_len / 2
        }
    }

    /// Dispatches to the quick-check routine that belongs to this form.
    #[inline]
    fn quick_check(self, input: &str) -> quick_check::IsNormalized {
        match self {
            Form::Nfd => quick_check::quick_check_nfd(input),
            Form::Nfkd => quick_check::quick_check_nfkd(input),
            Form::Nfc => quick_check::quick_check_nfc(input),
            Form::Nfkc => quick_check::quick_check_nfkc(input),
        }
    }
}
/// Pending combining sequence that has been read but not yet written to the output.
struct NormState {
// Most recent character fed with combining class 0, if it has not been flushed yet.
current_starter: Option<char>,
// Characters with a non-zero combining class collected since `current_starter`;
// they are sorted when the state is flushed.
ccc_buf: CccBuffer,
}
impl NormState {
    /// Creates a state with no pending starter and no buffered marks.
    #[inline]
    fn new() -> Self {
        NormState {
            current_starter: None,
            ccc_buf: CccBuffer::new(),
        }
    }

    /// Sorts the buffered marks, appends them to `out`, and empties the buffer.
    ///
    /// Does nothing when no marks are buffered.
    #[inline]
    fn drain_marks_sorted(&mut self, out: &mut String) {
        if self.ccc_buf.is_empty() {
            return;
        }
        self.ccc_buf.sort_in_place();
        for entry in self.ccc_buf.as_slice() {
            out.push(entry.ch);
        }
        self.ccc_buf.clear();
    }

    /// Writes the pending starter and its marks to `out` and resets the state.
    ///
    /// With `composes` set and at least one buffered mark, the sorted sequence is
    /// handed to `compose::compose_combining_sequence_into`; otherwise the starter
    /// is emitted followed by the sorted marks. Marks without a starter are emitted
    /// sorted on their own.
    #[inline]
    fn flush(&mut self, out: &mut String, composes: bool) {
        let Some(starter) = self.current_starter.take() else {
            self.drain_marks_sorted(out);
            return;
        };
        if composes && !self.ccc_buf.is_empty() {
            self.ccc_buf.sort_in_place();
            compose::compose_combining_sequence_into(starter, self.ccc_buf.as_slice(), out);
            self.ccc_buf.clear();
        } else {
            out.push(starter);
            self.drain_marks_sorted(out);
        }
    }

    /// Feeds one decomposed character with combining class `ccc` into the state.
    ///
    /// Non-starters are buffered. A starter first tries to merge with the pending
    /// starter (only when composing and no marks sit in between); failing that,
    /// the pending sequence is flushed and `ch` becomes the new pending starter.
    #[inline]
    fn feed_entry(&mut self, ch: char, ccc: u8, out: &mut String, composes: bool) {
        if ccc != 0 {
            self.ccc_buf.push(ch, ccc);
            return;
        }
        if composes && self.ccc_buf.is_empty() {
            let merged = self
                .current_starter
                .and_then(|prev| compose::compose(prev, ch));
            if let Some(composed) = merged {
                self.current_starter = Some(composed);
                return;
            }
        }
        self.flush(out, composes);
        self.current_starter = Some(ch);
    }

    /// Decomposition-only flush: emits the starter followed by its sorted marks.
    ///
    /// A single buffered mark is taken through `take_single_inline`, which skips
    /// the sort; the existing tests show the buffer is empty afterwards.
    #[inline]
    fn flush_nfd(&mut self, out: &mut String) {
        let Some(starter) = self.current_starter.take() else {
            self.drain_marks_sorted(out);
            return;
        };
        out.push(starter);
        if let Some(entry) = self.ccc_buf.take_single_inline() {
            out.push(entry.ch);
            return;
        }
        self.drain_marks_sorted(out);
    }

    /// Decomposition-only counterpart of `feed_entry`: starters flush the pending
    /// sequence and take its place, non-starters are buffered.
    #[inline]
    fn feed_entry_nfd(&mut self, ch: char, ccc: u8, out: &mut String) {
        if ccc != 0 {
            self.ccc_buf.push(ch, ccc);
            return;
        }
        self.flush_nfd(out);
        self.current_starter = Some(ch);
    }
}
/// Returns `true` when `cp` lies in U+3400..=U+4DBF or U+4E00..=U+9FFF.
#[inline(always)]
fn is_cjk_unified(cp: u32) -> bool {
    matches!(cp, 0x3400..=0x4DBF | 0x4E00..=0x9FFF)
}
/// Returns `true` for the supplementary code points the decomposing fast path skips.
///
/// Accepted ranges: U+1F252..=U+1FBEF, and everything from U+20000 upward except
/// U+2F800..=U+2FA1F.
#[inline(always)]
fn is_supp_safe(cp: u32) -> bool {
    match cp {
        0x1F252..=0x1FBEF => true,
        0x2F800..=0x2FA1F => false,
        _ => cp >= 0x20000,
    }
}
/// Decomposes `ch` for `form` and feeds the resulting characters into `state`.
///
/// CJK unified ideographs are installed directly as the new pending starter,
/// Hangul syllables are split into their jamo, and everything else goes through
/// the decomposition trie. `decomp_buf` is scratch space for expansions.
#[inline]
fn process_char(
    ch: char,
    state: &mut NormState,
    out: &mut String,
    form: Form,
    decomp_buf: &mut CccBuffer,
) {
    let composes = form.composes();
    let cp = ch as u32;

    if cp >= 0x3400 && is_cjk_unified(cp) {
        state.flush(out, composes);
        state.current_starter = Some(ch);
        return;
    }

    if hangul::is_hangul_syllable(ch) {
        let (lead, vowel, trail) = hangul::decompose_hangul(ch);
        state.feed_entry(lead, 0, out, composes);
        state.feed_entry(vowel, 0, out, composes);
        if let Some(tail) = trail {
            state.feed_entry(tail, 0, out, composes);
        }
        return;
    }

    let trie_value = tables::raw_decomp_trie_value(ch, form.decomp_form());
    if tables::has_decomposition(trie_value) {
        decomp_buf.clear();
        decompose::decompose_from_trie_value(ch, trie_value, decomp_buf, form.decomp_form());
        for entry in decomp_buf.as_slice() {
            state.feed_entry(entry.ch, entry.ccc, out, composes);
        }
    } else {
        // No mapping: the character is fed unchanged with its own combining class.
        state.feed_entry(ch, tables::ccc_from_trie_value(trie_value), out, composes);
    }
}
/// Feeds `ch` into `state` using an already fetched trie value `tv`.
///
/// Characters without a decomposition are fed with the combining class stored in
/// `tv`; otherwise the expansion is written to `decomp_buf` and fed entry by entry.
// NOTE(review): no caller is visible in this file, hence the dead_code allowance;
// confirm whether the helper is still needed.
#[allow(dead_code)]
#[inline(always)]
fn process_from_trie(
    ch: char,
    tv: u32,
    state: &mut NormState,
    out: &mut String,
    form: Form,
    decomp_buf: &mut CccBuffer,
) {
    let composes = form.composes();
    if !tables::has_decomposition(tv) {
        state.feed_entry(ch, tables::ccc_from_trie_value(tv), out, composes);
        return;
    }
    decomp_buf.clear();
    decompose::decompose_from_trie_value(ch, tv, decomp_buf, form.decomp_form());
    for entry in decomp_buf.as_slice() {
        state.feed_entry(entry.ch, entry.ccc, out, composes);
    }
}
/// Decomposition-only processing of `ch` from an already fetched trie value `tv`.
///
/// Three cases are handled in order: no decomposition (feed `ch` with the class
/// stored in `tv`), a multi-entry expansion returned by
/// `tables::expansion_data_from_trie_value`, and otherwise a single replacement
/// code point taken from the low 16 bits of `tv`.
#[inline(always)]
fn process_from_trie_nfd(
ch: char,
tv: u32,
state: &mut NormState,
out: &mut String,
decomp_form: DecompForm,
) {
if !tables::has_decomposition(tv) {
let ccc = tables::ccc_from_trie_value(tv);
state.feed_entry_nfd(ch, ccc, out);
return;
}
if let Some(data) = tables::expansion_data_from_trie_value(tv, decomp_form) {
// Each expansion entry packs a code point (EXPANSION_CP_MASK) and its
// combining class (bits from EXPANSION_CCC_SHIFT upward).
if data.len() == 2 {
// Fast path for a two-entry expansion whose first entry is a starter:
// install the starter directly instead of going through feed_entry_nfd.
let e0 = data[0];
let ccc0 = (e0 >> tables::EXPANSION_CCC_SHIFT) as u8;
if ccc0 == 0 {
state.flush_nfd(out);
let cp0 = e0 & tables::EXPANSION_CP_MASK;
debug_assert!(cp0 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp0));
// SAFETY: relies on the expansion table holding only valid scalar values;
// this is checked by the debug_assert above in debug builds only.
// NOTE(review): confirm the table generator guarantees this.
state.current_starter = Some(unsafe { char::from_u32_unchecked(cp0) });
let e1 = data[1];
let cp1 = e1 & tables::EXPANSION_CP_MASK;
let ccc1 = (e1 >> tables::EXPANSION_CCC_SHIFT) as u8;
debug_assert!(cp1 <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp1));
// SAFETY: same table invariant as for `cp0` above.
let ch1 = unsafe { char::from_u32_unchecked(cp1) };
if ccc1 != 0 {
// The state was just flushed, so the mark buffer belongs to the new starter.
state.ccc_buf.push(ch1, ccc1);
} else {
state.feed_entry_nfd(ch1, 0, out);
}
return;
}
}
// General path: feed every expansion entry in table order.
for &entry in data {
let cp = entry & tables::EXPANSION_CP_MASK;
let ccc = (entry >> tables::EXPANSION_CCC_SHIFT) as u8;
debug_assert!(cp <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp));
// SAFETY: same table invariant as above (valid scalar value, debug-checked).
let exp_ch = unsafe { char::from_u32_unchecked(cp) };
state.feed_entry_nfd(exp_ch, ccc, out);
}
return;
}
// No expansion data: the low 16 bits of the trie value are the replacement
// code point itself.
let info = tv & 0xFFFF;
debug_assert!(info <= 0xD7FF || (0xE000..=0xFFFF).contains(&info));
// SAFETY: `info` is at most 0xFFFF; the debug_assert above rejects surrogates in
// debug builds. NOTE(review): release builds rely on the trie never storing one.
let decomposed = unsafe { char::from_u32_unchecked(info) };
// ASCII replacements are assigned class 0 without a table lookup.
let ccc = if info <= 0x7F {
0
} else {
tables::lookup_ccc(decomposed)
};
state.feed_entry_nfd(decomposed, ccc, out);
}
/// Emits the passthrough run `pass` that immediately precedes `ch` on the
/// composing path.
///
/// When `tables::needs_starter_shadow` reports that `ch` may interact with the
/// character before it, the last character of `pass` is held back in `state` as
/// the pending starter instead of being written, so the following `ch` can still
/// combine with it. Otherwise the whole run is copied to `out`.
///
/// `pass` is expected to end in an ASCII byte (the caller only passes runs of
/// bytes below the scanner's passthrough bound). An empty `pass` is a no-op.
#[inline(always)]
fn flush_compose_passthrough(
    pass: &str,
    ch: char,
    form: Form,
    state: &mut NormState,
    out: &mut String,
) {
    // Nothing to emit or shadow for an empty run; this also avoids computing
    // `len - 1` on a zero length.
    let Some((&last_byte, head)) = pass.as_bytes().split_last() else {
        return;
    };

    let cp = ch as u32;
    let next_tv = if cp >= 0x10000 {
        // SAFETY: this branch is only taken for `cp >= 0x10000`.
        // NOTE(review): presumably that is the full precondition of the
        // supplementary lookup — confirm against `tables`.
        unsafe { tables::raw_decomp_trie_value_supplementary(cp, form.decomp_form()) }
    } else {
        tables::raw_decomp_trie_value(ch, form.decomp_form())
    };

    if !tables::needs_starter_shadow(next_tv) {
        out.push_str(pass);
        return;
    }

    // The byte-to-char conversion below is only meaningful for ASCII, and the
    // slice boundary is only guaranteed to be a char boundary in that case.
    debug_assert!(
        last_byte.is_ascii(),
        "passthrough run must end in an ASCII byte"
    );
    out.push_str(&pass[..head.len()]);
    state.feed_entry(char::from(last_byte), 0, out, true);
}
/// Character-at-a-time normalization of `input` into `form`.
///
/// Returns the input borrowed when it is empty, when the quick check answers
/// `Yes`, or when the normalized text turns out to be identical to the input;
/// otherwise returns the newly built string.
fn normalize_scalar<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
    if input.is_empty() || form.quick_check(input) == quick_check::IsNormalized::Yes {
        return Cow::Borrowed(input);
    }

    let mut normalized = String::with_capacity(input.len());
    let mut pending = NormState::new();
    let mut scratch = CccBuffer::new();
    input
        .chars()
        .for_each(|ch| process_char(ch, &mut pending, &mut normalized, form, &mut scratch));
    // Emit whatever combining sequence is still pending after the last character.
    pending.flush(&mut normalized, form.composes());

    if normalized != input {
        Cow::Owned(normalized)
    } else {
        Cow::Borrowed(input)
    }
}
/// Normalizes `input` into `form`, scanning 64-byte chunks for bytes that need work.
///
/// Inputs shorter than 64 bytes are delegated to `normalize_scalar`. Longer inputs
/// are quick-checked first; a `Yes` answer returns the input borrowed. Otherwise
/// the input is walked chunk by chunk: stretches that need no work are copied
/// verbatim from `input`, and only flagged characters go through the
/// decomposition/composition state machine.
///
/// Returns `Cow::Borrowed` when the quick check said `Yes`, or when it said
/// `Maybe` and the produced text equals the input; `Cow::Owned` otherwise.
fn normalize_impl<'a>(input: &'a str, form: Form) -> Cow<'a, str> {
let bytes = input.as_bytes();
let len = bytes.len();
if len < 64 {
return normalize_scalar(input, form);
}
let qc = form.quick_check(input);
if qc == quick_check::IsNormalized::Yes {
return Cow::Borrowed(input);
}
let bound = form.passthrough_bound();
let composes = form.composes();
let mut out = String::with_capacity(form.estimated_capacity(len));
// Byte offset into `input` up to which everything has already been emitted
// (either copied verbatim or fed through `state`).
let mut last_written: usize = 0;
let mut state = NormState::new();
let mut decomp_buf = CccBuffer::new();
let mut pos: usize = 0;
let ptr = bytes.as_ptr();
// Main loop: one full 64-byte chunk per iteration.
while pos + 64 <= len {
let chunk_start = pos;
// SAFETY: the loop condition guarantees `pos + 64 <= len`, so `ptr.add(pos)`
// stays inside `bytes`. The prefetch pointers are formed with `wrapping_add`
// and may lie past the end of the input.
// NOTE(review): presumably `scan_and_prefetch` reads 64 bytes from its first
// argument and uses the other two only as prefetch hints — confirm in `simd`.
let mask = unsafe {
let prefetch_l1 =
ptr.wrapping_add(pos + prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE);
let prefetch_l2 =
ptr.wrapping_add(pos + prefetch::PREFETCH_L2_DISTANCE * prefetch::CHUNK_SIZE);
simd::scan_and_prefetch(ptr.add(pos), prefetch_l1, prefetch_l2, bound)
};
// Write-side prefetch of the output buffer, issued only when the target offset
// is still within the current capacity.
// NOTE(review): assumes `prefetch_write` is a pure hint that never dereferences
// its argument — confirm in `simd::prefetch`.
unsafe {
let write_head = out.len();
let distance = prefetch::PREFETCH_L1_DISTANCE * prefetch::CHUNK_SIZE;
if write_head + distance <= out.capacity() {
prefetch::prefetch_write(out.as_ptr().wrapping_add(write_head + distance));
}
}
// No flagged byte in this chunk: it stays part of the pending verbatim run.
if mask == 0 {
pos += 64;
continue;
}
let mut chunk_mask = mask;
// Visit the set bits from lowest to highest; each bit index is a byte offset
// within the chunk.
while chunk_mask != 0 {
let bit_pos = chunk_mask.trailing_zeros() as usize;
// Clear the lowest set bit.
chunk_mask &= chunk_mask.wrapping_sub(1);
let byte_pos = chunk_start + bit_pos;
// Skip bytes that belong to a character already consumed.
if byte_pos < last_written {
continue;
}
if utf8::is_continuation_byte(bytes[byte_pos]) {
continue;
}
let (ch, width) = utf8::decode_char_at(bytes, byte_pos);
if !composes {
// Decomposition-only path. Characters that need no work are skipped without
// touching `last_written`, so they remain part of the verbatim run.
let cp = ch as u32;
if (cp >= 0x3400 && is_cjk_unified(cp)) || (cp >= 0x10000 && is_supp_safe(cp)) {
continue;
}
if hangul::is_hangul_syllable(ch) {
// Emit the pending sequence and the verbatim run before the syllable.
if byte_pos > last_written {
state.flush_nfd(&mut out);
out.push_str(&input[last_written..byte_pos]);
}
last_written = byte_pos + width;
// This flush is a no-op when the branch above already flushed.
state.flush_nfd(&mut out);
let (l, v, t) = hangul::decompose_hangul(ch);
out.push(l);
out.push(v);
if let Some(t_char) = t {
out.push(t_char);
}
continue;
}
let tv = if cp >= 0x10000 {
// SAFETY: guarded by `cp >= 0x10000`. NOTE(review): presumably that is the
// lookup's only precondition — confirm against `tables`.
unsafe { tables::raw_decomp_trie_value_supplementary(cp, form.decomp_form()) }
} else {
tables::raw_decomp_trie_value(ch, form.decomp_form())
};
// A starter without a decomposition needs no work either.
if !tables::has_decomposition(tv) && tables::ccc_from_trie_value(tv) == 0 {
continue; }
if byte_pos > last_written {
state.flush_nfd(&mut out);
out.push_str(&input[last_written..byte_pos]);
}
last_written = byte_pos + width;
process_from_trie_nfd(ch, tv, &mut state, &mut out, form.decomp_form());
continue;
}
// Composing path: every flagged lead byte is processed.
if byte_pos > last_written {
state.flush(&mut out, composes);
let pass = &input[last_written..byte_pos];
// NOTE(review): `composes` is always true here because the `!composes` branch
// above always `continue`s, so the `else` arm is unreachable.
if composes {
flush_compose_passthrough(pass, ch, form, &mut state, &mut out);
} else {
out.push_str(pass);
}
}
last_written = byte_pos + width;
process_char(ch, &mut state, &mut out, form, &mut decomp_buf);
}
pos += 64;
}
// Tail: fewer than 64 bytes remain; walk them bytewise with the same per-character
// logic as the chunk loop.
if pos < len {
let tail_has_work = bytes[pos..].iter().any(|&b| b >= bound);
if tail_has_work {
let mut tail_pos = pos;
while tail_pos < len {
// Bytes of a character consumed from the previous chunk.
if tail_pos < last_written {
tail_pos += 1;
continue;
}
if utf8::is_continuation_byte(bytes[tail_pos]) {
tail_pos += 1;
continue;
}
let (ch, width) = utf8::decode_char_at(bytes, tail_pos);
if !composes {
let cp = ch as u32;
if (cp >= 0x3400 && is_cjk_unified(cp)) || (cp >= 0x10000 && is_supp_safe(cp)) {
tail_pos += width;
continue;
}
if hangul::is_hangul_syllable(ch) {
if tail_pos > last_written {
state.flush_nfd(&mut out);
out.push_str(&input[last_written..tail_pos]);
}
last_written = tail_pos + width;
// No-op when the branch above already flushed.
state.flush_nfd(&mut out);
let (l, v, t) = hangul::decompose_hangul(ch);
out.push(l);
out.push(v);
if let Some(t_char) = t {
out.push(t_char);
}
tail_pos += width;
continue;
}
let tv = if cp >= 0x10000 {
// SAFETY: guarded by `cp >= 0x10000`, as in the chunk loop.
unsafe {
tables::raw_decomp_trie_value_supplementary(cp, form.decomp_form())
}
} else {
tables::raw_decomp_trie_value(ch, form.decomp_form())
};
if !tables::has_decomposition(tv) && tables::ccc_from_trie_value(tv) == 0 {
tail_pos += width;
continue;
}
if tail_pos > last_written {
state.flush_nfd(&mut out);
out.push_str(&input[last_written..tail_pos]);
}
last_written = tail_pos + width;
process_from_trie_nfd(ch, tv, &mut state, &mut out, form.decomp_form());
tail_pos += width;
continue;
}
if tail_pos > last_written {
state.flush(&mut out, composes);
let pass = &input[last_written..tail_pos];
// NOTE(review): as in the chunk loop, `composes` is always true here.
if composes {
flush_compose_passthrough(pass, ch, form, &mut state, &mut out);
} else {
out.push_str(pass);
}
}
last_written = tail_pos + width;
process_char(ch, &mut state, &mut out, form, &mut decomp_buf);
tail_pos += width;
}
}
}
// Emit the last pending sequence, then the trailing verbatim run.
if composes {
state.flush(&mut out, true);
} else {
state.flush_nfd(&mut out);
}
if last_written < len {
out.push_str(&input[last_written..len]);
}
// The equality check is only performed for a `Maybe` quick-check answer.
if qc == quick_check::IsNormalized::Maybe && out == input {
Cow::Borrowed(input)
} else {
Cow::Owned(out)
}
}
/// Stateless normalizer producing NFC (see `Form::Nfc`).
pub struct NfcNormalizer;
/// Stateless normalizer producing NFD (see `Form::Nfd`).
pub struct NfdNormalizer;
/// Stateless normalizer producing NFKC (see `Form::Nfkc`).
pub struct NfkcNormalizer;
/// Stateless normalizer producing NFKD (see `Form::Nfkd`).
pub struct NfkdNormalizer;
impl Default for NfcNormalizer {
fn default() -> Self {
Self::new()
}
}
impl Default for NfdNormalizer {
fn default() -> Self {
Self::new()
}
}
impl Default for NfkcNormalizer {
fn default() -> Self {
Self::new()
}
}
impl Default for NfkdNormalizer {
fn default() -> Self {
Self::new()
}
}
impl NfcNormalizer {
pub fn new() -> Self {
NfcNormalizer
}
pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
quick_check::quick_check_nfc(input)
}
pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
normalize_impl(input, Form::Nfc)
}
pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
let result = normalize_impl(input, Form::Nfc);
let already_normalized = matches!(&result, Cow::Borrowed(_));
out.push_str(&result);
already_normalized
}
pub fn is_normalized(&self, input: &str) -> bool {
quick_check::is_normalized_nfc(input)
}
}
impl NfdNormalizer {
pub fn new() -> Self {
NfdNormalizer
}
pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
quick_check::quick_check_nfd(input)
}
pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
normalize_impl(input, Form::Nfd)
}
pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
let result = normalize_impl(input, Form::Nfd);
let already_normalized = matches!(&result, Cow::Borrowed(_));
out.push_str(&result);
already_normalized
}
pub fn is_normalized(&self, input: &str) -> bool {
quick_check::is_normalized_nfd(input)
}
}
impl NfkcNormalizer {
pub fn new() -> Self {
NfkcNormalizer
}
pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
quick_check::quick_check_nfkc(input)
}
pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
normalize_impl(input, Form::Nfkc)
}
pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
let result = normalize_impl(input, Form::Nfkc);
let already_normalized = matches!(&result, Cow::Borrowed(_));
out.push_str(&result);
already_normalized
}
pub fn is_normalized(&self, input: &str) -> bool {
quick_check::is_normalized_nfkc(input)
}
}
impl NfkdNormalizer {
pub fn new() -> Self {
NfkdNormalizer
}
pub fn quick_check(&self, input: &str) -> crate::quick_check::IsNormalized {
quick_check::quick_check_nfkd(input)
}
pub fn normalize<'a>(&self, input: &'a str) -> Cow<'a, str> {
normalize_impl(input, Form::Nfkd)
}
pub fn normalize_to(&self, input: &str, out: &mut String) -> bool {
let result = normalize_impl(input, Form::Nfkd);
let already_normalized = matches!(&result, Cow::Borrowed(_));
out.push_str(&result);
already_normalized
}
pub fn is_normalized(&self, input: &str) -> bool {
quick_check::is_normalized_nfkd(input)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::borrow::Cow;
    use alloc::string::String;
    use alloc::vec::Vec;

    // ---- Form helpers -------------------------------------------------------

    #[test]
    fn passthrough_bound_all_forms_return_0xc0() {
        for form in [Form::Nfc, Form::Nfd, Form::Nfkc, Form::Nfkd] {
            assert_eq!(form.passthrough_bound(), 0xC0);
        }
    }

    #[test]
    fn composes_nfc_nfkc_true_nfd_nfkd_false() {
        for form in [Form::Nfc, Form::Nfkc] {
            assert!(form.composes());
        }
        for form in [Form::Nfd, Form::Nfkd] {
            assert!(!form.composes());
        }
    }

    #[test]
    fn decomp_form_canonical_vs_compatible() {
        for form in [Form::Nfc, Form::Nfd] {
            assert_eq!(form.decomp_form(), DecompForm::Canonical);
        }
        for form in [Form::Nfkc, Form::Nfkd] {
            assert_eq!(form.decomp_form(), DecompForm::Compatible);
        }
    }

    #[test]
    fn estimated_capacity_nfc_nfkc_same_nfd_nfkd_larger() {
        let input_len = 100;
        for form in [Form::Nfc, Form::Nfkc] {
            assert_eq!(form.estimated_capacity(input_len), 100);
        }
        for form in [Form::Nfd, Form::Nfkd] {
            assert_eq!(form.estimated_capacity(input_len), 150);
        }
    }

    #[test]
    fn estimated_capacity_zero_length() {
        for form in [Form::Nfc, Form::Nfd] {
            assert_eq!(form.estimated_capacity(0), 0);
        }
    }

    #[test]
    fn quick_check_ascii_is_yes_for_all_forms() {
        let ascii = "Hello, World!";
        for form in [Form::Nfc, Form::Nfd, Form::Nfkc, Form::Nfkd] {
            assert_eq!(form.quick_check(ascii), quick_check::IsNormalized::Yes);
        }
    }

    // ---- NormState::feed_entry / feed_entry_nfd -----------------------------

    #[test]
    fn normstate_new_has_no_starter_empty_ccc_buf() {
        let fresh = NormState::new();
        assert!(fresh.current_starter.is_none());
        assert!(fresh.ccc_buf.is_empty());
    }

    #[test]
    fn feed_entry_single_starter_sets_current_starter() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.feed_entry('A', 0, &mut sink, false);
        assert_eq!(state.current_starter, Some('A'));
        assert!(state.ccc_buf.is_empty());
        // The starter stays pending until something flushes it.
        assert!(sink.is_empty());
    }

    #[test]
    fn feed_entry_combining_mark_buffers_in_ccc_buf() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.feed_entry('e', 0, &mut sink, false);
        state.feed_entry('\u{0301}', 230, &mut sink, false);
        assert_eq!(state.current_starter, Some('e'));
        assert!(!state.ccc_buf.is_empty());
        assert_eq!(state.ccc_buf.len(), 1);
        let buffered = &state.ccc_buf.as_slice()[0];
        assert_eq!(buffered.ch, '\u{0301}');
        assert_eq!(buffered.ccc, 230);
    }

    #[test]
    fn feed_entry_two_starters_first_gets_flushed() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.feed_entry('A', 0, &mut sink, false);
        assert!(sink.is_empty());
        state.feed_entry('B', 0, &mut sink, false);
        assert_eq!(sink, "A");
        assert_eq!(state.current_starter, Some('B'));
    }

    #[test]
    fn feed_entry_starter_to_starter_composition_hangul_lv() {
        let mut state = NormState::new();
        let mut sink = String::new();
        // Leading consonant followed by a vowel merges into one syllable.
        state.feed_entry('\u{1100}', 0, &mut sink, true);
        state.feed_entry('\u{1161}', 0, &mut sink, true);
        assert_eq!(state.current_starter, Some('\u{AC00}'));
        assert!(sink.is_empty());
    }

    #[test]
    fn feed_entry_starter_to_starter_composition_e_acute() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.feed_entry('e', 0, &mut sink, true);
        state.feed_entry('\u{0301}', 230, &mut sink, true);
        state.flush(&mut sink, true);
        assert_eq!(sink, "\u{00E9}");
    }

    #[test]
    fn feed_entry_nfd_starters_and_combining_marks() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.feed_entry_nfd('A', 0, &mut sink);
        assert_eq!(state.current_starter, Some('A'));
        state.feed_entry_nfd('\u{0300}', 230, &mut sink);
        assert_eq!(state.ccc_buf.len(), 1);
        // The next starter pushes the finished sequence out.
        state.feed_entry_nfd('B', 0, &mut sink);
        assert_eq!(sink, "A\u{0300}");
        assert_eq!(state.current_starter, Some('B'));
    }

    // ---- NormState::flush ---------------------------------------------------

    #[test]
    fn flush_no_starter_no_marks_nothing_emitted() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.flush(&mut sink, false);
        assert!(sink.is_empty());
        state.flush(&mut sink, true);
        assert!(sink.is_empty());
    }

    #[test]
    fn flush_starter_only_emits_starter() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.current_starter = Some('X');
        state.flush(&mut sink, false);
        assert_eq!(sink, "X");
    }

    #[test]
    fn flush_starter_one_combining_mark_no_compose() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.current_starter = Some('e');
        state.ccc_buf.push('\u{0301}', 230);
        state.flush(&mut sink, false);
        assert_eq!(sink, "e\u{0301}");
    }

    #[test]
    fn flush_starter_one_combining_mark_with_compose() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.current_starter = Some('e');
        state.ccc_buf.push('\u{0301}', 230);
        state.flush(&mut sink, true);
        assert_eq!(sink, "\u{00E9}");
    }

    #[test]
    fn flush_starter_multiple_ccc_disordered_marks_emits_sorted() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.current_starter = Some('a');
        // Pushed in descending class order on purpose.
        state.ccc_buf.push('\u{0301}', 230);
        state.ccc_buf.push('\u{0323}', 220);
        state.ccc_buf.push('\u{0327}', 202);
        state.flush(&mut sink, false);
        let emitted: Vec<char> = sink.chars().collect();
        assert_eq!(emitted[0], 'a');
        assert_eq!(emitted[1], '\u{0327}');
        assert_eq!(emitted[2], '\u{0323}');
        assert_eq!(emitted[3], '\u{0301}');
    }

    #[test]
    fn flush_orphan_combining_marks_no_starter_emits_sorted() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.ccc_buf.push('\u{0301}', 230);
        state.ccc_buf.push('\u{0327}', 202);
        state.flush(&mut sink, false);
        let emitted: Vec<char> = sink.chars().collect();
        assert_eq!(emitted.len(), 2);
        assert_eq!(emitted[0], '\u{0327}');
        assert_eq!(emitted[1], '\u{0301}');
    }

    // ---- NormState::flush_nfd -----------------------------------------------

    #[test]
    fn flush_nfd_no_starter_no_marks_nothing_emitted() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.flush_nfd(&mut sink);
        assert!(sink.is_empty());
    }

    #[test]
    fn flush_nfd_starter_only_emits_starter() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.current_starter = Some('Z');
        state.flush_nfd(&mut sink);
        assert_eq!(sink, "Z");
    }

    #[test]
    fn flush_nfd_single_mark_fast_path_take_single_inline() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.current_starter = Some('e');
        state.ccc_buf.push('\u{0301}', 230);
        state.flush_nfd(&mut sink);
        assert_eq!(sink, "e\u{0301}");
        assert!(state.ccc_buf.is_empty());
    }

    #[test]
    fn flush_nfd_multiple_marks_sorted() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.current_starter = Some('o');
        state.ccc_buf.push('\u{0301}', 230);
        state.ccc_buf.push('\u{0327}', 202);
        state.flush_nfd(&mut sink);
        let emitted: Vec<char> = sink.chars().collect();
        assert_eq!(emitted[0], 'o');
        assert_eq!(emitted[1], '\u{0327}');
        assert_eq!(emitted[2], '\u{0301}');
    }

    #[test]
    fn flush_nfd_orphan_combining_marks_no_starter() {
        let mut state = NormState::new();
        let mut sink = String::new();
        state.ccc_buf.push('\u{0301}', 230);
        state.ccc_buf.push('\u{0323}', 220);
        state.flush_nfd(&mut sink);
        let emitted: Vec<char> = sink.chars().collect();
        assert_eq!(emitted.len(), 2);
        assert_eq!(emitted[0], '\u{0323}');
        assert_eq!(emitted[1], '\u{0301}');
    }

    // ---- normalize_impl: borrowed vs owned ----------------------------------

    #[test]
    fn normalize_impl_nfc_already_normalized_returns_borrowed() {
        let input = "\u{00C5}\u{0300}";
        let result = normalize_impl(input, Form::Nfc);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for already-NFC input with Maybe QC, got Cow::Owned({:?})",
            result
        );
        assert_eq!(&*result, input);
    }

    #[test]
    fn normalize_impl_nfc_maybe_borrowed_simd_path() {
        // 60 ASCII bytes plus two 2-byte characters reach the 64-byte threshold.
        let mut input = "a".repeat(60);
        input.push_str("\u{00C5}\u{0300}");
        assert!(input.len() >= 64, "input must be >= 64 bytes for SIMD path");
        let result = normalize_impl(&input, Form::Nfc);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for >=64 byte already-NFC input with Maybe QC, got Cow::Owned({:?})",
            result
        );
        assert_eq!(&*result, &*input);
    }

    #[test]
    fn normalize_impl_ascii_returns_borrowed() {
        let input = "Hello, world!";
        let result = normalize_impl(input, Form::Nfc);
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(&*result, input);
    }

    #[test]
    fn normalize_impl_nfd_already_decomposed_returns_borrowed() {
        let input = "e\u{0301}";
        let result = normalize_impl(input, Form::Nfd);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "Expected Cow::Borrowed for already-NFD input"
        );
    }

    #[test]
    fn normalize_impl_nfc_not_normalized_returns_owned() {
        let input = "e\u{0301}";
        let result = normalize_impl(input, Form::Nfc);
        assert!(matches!(result, Cow::Owned(_)));
        assert_eq!(&*result, "\u{00E9}");
    }

    // ---- is_cjk_unified range boundaries ------------------------------------

    #[test]
    fn cjk_unified_extension_a_start() {
        assert!(is_cjk_unified(0x3400));
    }

    #[test]
    fn cjk_unified_extension_a_end() {
        assert!(is_cjk_unified(0x4DBF));
    }

    #[test]
    fn cjk_unified_main_start() {
        assert!(is_cjk_unified(0x4E00));
    }

    #[test]
    fn cjk_unified_main_end() {
        assert!(is_cjk_unified(0x9FFF));
    }

    #[test]
    fn cjk_unified_just_before_extension_a() {
        assert!(!is_cjk_unified(0x33FF));
    }

    #[test]
    fn cjk_unified_gap_between_extension_a_and_main() {
        assert!(!is_cjk_unified(0x4DC0));
    }

    #[test]
    fn cjk_unified_just_after_main() {
        assert!(!is_cjk_unified(0xA000));
    }

    // ---- is_supp_safe range boundaries --------------------------------------

    #[test]
    fn supp_safe_plane2_start() {
        assert!(is_supp_safe(0x20000));
    }

    #[test]
    fn supp_safe_cjk_compat_supplement_start() {
        assert!(!is_supp_safe(0x2F800));
    }

    #[test]
    fn supp_safe_cjk_compat_supplement_end() {
        assert!(!is_supp_safe(0x2FA1F));
    }

    #[test]
    fn supp_safe_just_after_compat_supplement() {
        assert!(is_supp_safe(0x2FA20));
    }

    #[test]
    fn supp_safe_plane1_safe_range_start() {
        assert!(is_supp_safe(0x1F252));
    }

    #[test]
    fn supp_safe_plane1_safe_range_end() {
        assert!(is_supp_safe(0x1FBEF));
    }

    #[test]
    fn supp_safe_just_before_plane1_safe_range() {
        assert!(!is_supp_safe(0x1F251));
    }

    #[test]
    fn supp_safe_just_after_plane1_safe_range() {
        assert!(!is_supp_safe(0x1FBF0));
    }

    #[test]
    fn supp_safe_smp_start_before_safe_range() {
        assert!(!is_supp_safe(0x10000));
    }
}