use crate::simd;
use crate::tables;
use crate::utf8;
/// Outcome of a normalization quick check.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IsNormalized {
/// No inspected character gave a reason to doubt that the input is normalized.
Yes,
/// A character proved that the input is not in the requested form.
No,
/// The check was inconclusive; the `is_normalized_*` helpers in this file
/// resolve it by normalizing the input and comparing the result.
Maybe,
}
/// Decodes a raw quick-check table value: `0` means `Yes`, `1` means `Maybe`,
/// and every other value is treated as `No`.
#[inline]
fn qc_value_to_result(v: u8) -> IsNormalized {
    if v == 0 {
        IsNormalized::Yes
    } else if v == 1 {
        IsNormalized::Maybe
    } else {
        IsNormalized::No
    }
}
/// Returns `true` for code points in U+3400..=U+4DBF or U+4E00..=U+9FFF.
#[inline(always)]
fn is_cjk_unified(cp: u32) -> bool {
    matches!(cp, 0x3400..=0x4DBF | 0x4E00..=0x9FFF)
}
/// Range test used for code points that callers have already established to
/// be at or above U+10000.
///
/// Accepts U+1F252..=U+1FBEF and everything from U+20000 upwards except
/// U+2F800..=U+2FA1F; every other value is rejected.
#[inline(always)]
fn is_supp_safe(cp: u32) -> bool {
    match cp {
        0x1F252..=0x1FBEF => true,
        0x2F800..=0x2FA1F => false,
        _ => cp >= 0x20000,
    }
}
/// Returns `true` for U+3041..=U+3098, U+309D, U+309E and U+30A0..=U+30FE.
///
/// U+3099..=U+309C, U+309F and U+30FF are deliberately left out of the set.
#[inline(always)]
fn is_kana(cp: u32) -> bool {
    matches!(cp, 0x3041..=0x3098 | 0x309D | 0x309E | 0x30A0..=0x30FE)
}
/// Quick-check driver shared by the four normalization forms.
///
/// Inputs shorter than 64 bytes are handed to `quick_check_scalar`. Longer
/// inputs are scanned in 64-byte chunks with `simd::scan_chunk`; only the
/// bytes whose bit is set in the returned mask are decoded and examined.
/// Whatever remains after the last full chunk is walked one character at a
/// time.
///
/// # Parameters
///
/// * `qc_shift` - forwarded to `tables::lookup_ccc_qc` together with each
///   character that needs a table lookup.
/// * `simd_bound` - byte threshold forwarded to `simd::scan_chunk`.
/// * `safe_below` - code points below this value skip the table lookup.
/// * `hangul_safe` - when set, U+AC00..=U+D7A3 skips the table lookup.
/// * `kana_safe` - when set, code points accepted by `is_kana` skip the
///   table lookup.
/// * `latin1_upper_safe` - when set, U+00C0..=U+00FF skips the table lookup.
///
/// # Returns
///
/// `No` as soon as a looked-up character has a non-zero combining class that
/// is lower than the preceding one, or a quick-check value that decodes to
/// `No`. Otherwise `Maybe` if any looked-up character decoded to `Maybe`,
/// and `Yes` if none did.
#[inline]
fn quick_check_impl(
    input: &str,
    qc_shift: u32,
    simd_bound: u8,
    safe_below: u32,
    hangul_safe: bool,
    kana_safe: bool,
    latin1_upper_safe: bool,
) -> IsNormalized {
    const CHUNK_LEN: usize = 64;

    let bytes = input.as_bytes();
    let len = bytes.len();
    if len < CHUNK_LEN {
        return quick_check_scalar(
            input,
            qc_shift,
            safe_below,
            hangul_safe,
            kana_safe,
            latin1_upper_safe,
        );
    }

    // Cheap range tests that let a character bypass the table lookup. Shared
    // by the chunk loop and the tail loop so both honour every flag.
    let is_range_safe = |cp: u32| -> bool {
        cp < safe_below
            || (latin1_upper_safe && (0x00C0..0x0100).contains(&cp))
            || is_cjk_unified(cp)
            || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
            || (kana_safe && is_kana(cp))
            || (cp >= 0x10000 && is_supp_safe(cp))
    };

    let ptr = bytes.as_ptr();
    let mut last_ccc: u8 = 0;
    let mut result = IsNormalized::Yes;
    // First byte offset that has not yet been accounted for. It can run up to
    // three bytes past a chunk boundary when a character straddles it.
    let mut processed_up_to: usize = 0;
    let mut pos: usize = 0;

    while pos + CHUNK_LEN <= len {
        let chunk_end = pos + CHUNK_LEN;
        // SAFETY: the loop condition guarantees `pos + CHUNK_LEN <= len`, so
        // `ptr.add(pos)` stays inside `bytes` with at least `CHUNK_LEN`
        // readable bytes behind it.
        // NOTE(review): assumes `scan_chunk` reads at most `CHUNK_LEN` bytes
        // from the pointer -- confirm against the `simd` module.
        let mut mask = unsafe { simd::scan_chunk(ptr.add(pos), simd_bound) };
        if mask == 0 {
            // Nothing in this chunk needs a closer look.
            last_ccc = 0;
            processed_up_to = chunk_end;
            pos = chunk_end;
            continue;
        }

        while mask != 0 {
            let bit_pos = mask.trailing_zeros() as usize;
            // Clear the lowest set bit.
            mask &= mask.wrapping_sub(1);
            let byte_pos = pos + bit_pos;
            if byte_pos < processed_up_to {
                // Byte belongs to a character that was already decoded.
                continue;
            }
            if byte_pos > processed_up_to {
                // Unflagged bytes sit between the previous flagged character
                // and this one, so the combining-class run is broken.
                last_ccc = 0;
            }
            let (ch, width) = utf8::decode_char_at(bytes, byte_pos);
            processed_up_to = byte_pos + width;
            if is_range_safe(ch as u32) {
                last_ccc = 0;
                continue;
            }
            let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
            if ccc != 0 && last_ccc > ccc {
                return IsNormalized::No;
            }
            match qc_value_to_result(qc) {
                IsNormalized::No => return IsNormalized::No,
                IsNormalized::Maybe => result = IsNormalized::Maybe,
                IsNormalized::Yes => {}
            }
            last_ccc = ccc;
        }

        if processed_up_to < chunk_end {
            // Unflagged bytes trail the last flagged character of the chunk.
            last_ccc = 0;
            processed_up_to = chunk_end;
        }
        pos = chunk_end;
    }

    // Every loop iteration leaves `processed_up_to >= pos`, so the tail starts
    // exactly where the chunk loop stopped accounting for bytes.
    let mut tail_pos = processed_up_to;
    while tail_pos < len {
        let b = bytes[tail_pos];
        if b < 0x80 {
            last_ccc = 0;
            tail_pos += 1;
            continue;
        }
        if utf8::is_continuation_byte(b) {
            tail_pos += 1;
            continue;
        }
        let (ch, width) = utf8::decode_char_at(bytes, tail_pos);
        tail_pos += width;
        if is_range_safe(ch as u32) {
            last_ccc = 0;
            continue;
        }
        let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
        if ccc != 0 && last_ccc > ccc {
            return IsNormalized::No;
        }
        match qc_value_to_result(qc) {
            IsNormalized::No => return IsNormalized::No,
            IsNormalized::Maybe => result = IsNormalized::Maybe,
            IsNormalized::Yes => {}
        }
        last_ccc = ccc;
    }
    result
}
/// Character-by-character quick check, used by the chunked drivers for
/// inputs shorter than 64 bytes.
///
/// ASCII and every character covered by one of the range tests resets the
/// combining-class run without a table lookup. Each remaining character is
/// looked up with `tables::lookup_ccc_qc(ch, qc_shift)`.
///
/// Returns `No` on the first character whose non-zero combining class is
/// lower than its predecessor's or whose quick-check value decodes to `No`;
/// otherwise `Maybe` if any character decoded to `Maybe`, else `Yes`.
#[inline]
fn quick_check_scalar(
    input: &str,
    qc_shift: u32,
    safe_below: u32,
    hangul_safe: bool,
    kana_safe: bool,
    latin1_upper_safe: bool,
) -> IsNormalized {
    let mut prev_ccc: u8 = 0;
    let mut verdict = IsNormalized::Yes;

    for ch in input.chars() {
        let cp = u32::from(ch);
        let skips_lookup = ch.is_ascii()
            || cp < safe_below
            || (latin1_upper_safe && (0x00C0..0x0100).contains(&cp))
            || is_cjk_unified(cp)
            || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
            || (kana_safe && is_kana(cp))
            || (cp >= 0x10000 && is_supp_safe(cp));
        if skips_lookup {
            prev_ccc = 0;
            continue;
        }

        let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
        let status = qc_value_to_result(qc);
        let out_of_order = ccc != 0 && prev_ccc > ccc;
        if out_of_order || status == IsNormalized::No {
            return IsNormalized::No;
        }
        if status == IsNormalized::Maybe {
            verdict = IsNormalized::Maybe;
        }
        prev_ccc = ccc;
    }

    verdict
}
/// NFC quick check for `input` (crate-private build, without the
/// `quick_check_oracle` feature).
///
/// Forwards to `quick_check_impl` with the NFC table shift and all three
/// optional range shortcuts switched on.
#[cfg(not(feature = "quick_check_oracle"))]
pub(crate) fn quick_check_nfc(input: &str) -> IsNormalized {
    // U+0300 is encoded in UTF-8 as CC 80, so the byte bound lines up with
    // the scalar cut-off.
    const SIMD_BOUND: u8 = 0xCC;
    const SAFE_BELOW: u32 = 0x0300;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (true, true, true);
    quick_check_impl(
        input,
        tables::CCC_QC_NFC_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
/// NFC quick check for `input` (public build, enabled by the
/// `quick_check_oracle` feature).
///
/// Forwards to `quick_check_impl` with the NFC table shift and all three
/// optional range shortcuts switched on.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfc(input: &str) -> IsNormalized {
    // U+0300 is encoded in UTF-8 as CC 80, so the byte bound lines up with
    // the scalar cut-off.
    const SIMD_BOUND: u8 = 0xCC;
    const SAFE_BELOW: u32 = 0x0300;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (true, true, true);
    quick_check_impl(
        input,
        tables::CCC_QC_NFC_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
/// NFD quick check for `input` (crate-private build, without the
/// `quick_check_oracle` feature).
///
/// Forwards to `quick_check_impl` with the NFD table shift and all three
/// optional range shortcuts switched off.
#[cfg(not(feature = "quick_check_oracle"))]
pub(crate) fn quick_check_nfd(input: &str) -> IsNormalized {
    // U+00C0 is encoded in UTF-8 as C3 80, so the byte bound lines up with
    // the scalar cut-off.
    const SIMD_BOUND: u8 = 0xC3;
    const SAFE_BELOW: u32 = 0x00C0;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (false, false, false);
    quick_check_impl(
        input,
        tables::CCC_QC_NFD_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
/// NFD quick check for `input` (public build, enabled by the
/// `quick_check_oracle` feature).
///
/// Forwards to `quick_check_impl` with the NFD table shift and all three
/// optional range shortcuts switched off.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfd(input: &str) -> IsNormalized {
    // U+00C0 is encoded in UTF-8 as C3 80, so the byte bound lines up with
    // the scalar cut-off.
    const SIMD_BOUND: u8 = 0xC3;
    const SAFE_BELOW: u32 = 0x00C0;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (false, false, false);
    quick_check_impl(
        input,
        tables::CCC_QC_NFD_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
/// NFKC quick check for `input` (crate-private build, without the
/// `quick_check_oracle` feature).
///
/// Forwards to `quick_check_impl` with the NFKC table shift and all three
/// optional range shortcuts switched on.
#[cfg(not(feature = "quick_check_oracle"))]
pub(crate) fn quick_check_nfkc(input: &str) -> IsNormalized {
    // 0xC0 is no higher than any lead byte of a multi-byte UTF-8 sequence;
    // the scalar cut-off then separates U+0080..U+009F from U+00A0 onwards.
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (true, true, true);
    quick_check_impl(
        input,
        tables::CCC_QC_NFKC_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
/// NFKC quick check for `input` (public build, enabled by the
/// `quick_check_oracle` feature).
///
/// Forwards to `quick_check_impl` with the NFKC table shift and all three
/// optional range shortcuts switched on.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkc(input: &str) -> IsNormalized {
    // 0xC0 is no higher than any lead byte of a multi-byte UTF-8 sequence;
    // the scalar cut-off then separates U+0080..U+009F from U+00A0 onwards.
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (true, true, true);
    quick_check_impl(
        input,
        tables::CCC_QC_NFKC_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
/// NFKD quick check for `input` (crate-private build, without the
/// `quick_check_oracle` feature).
///
/// Forwards to `quick_check_impl` with the NFKD table shift and all three
/// optional range shortcuts switched off.
#[cfg(not(feature = "quick_check_oracle"))]
pub(crate) fn quick_check_nfkd(input: &str) -> IsNormalized {
    // 0xC0 is no higher than any lead byte of a multi-byte UTF-8 sequence;
    // the scalar cut-off then separates U+0080..U+009F from U+00A0 onwards.
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (false, false, false);
    quick_check_impl(
        input,
        tables::CCC_QC_NFKD_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
/// NFKD quick check for `input` (public build, enabled by the
/// `quick_check_oracle` feature).
///
/// Forwards to `quick_check_impl` with the NFKD table shift and all three
/// optional range shortcuts switched off.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkd(input: &str) -> IsNormalized {
    // 0xC0 is no higher than any lead byte of a multi-byte UTF-8 sequence;
    // the scalar cut-off then separates U+0080..U+009F from U+00A0 onwards.
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    let (hangul_safe, kana_safe, latin1_upper_safe) = (false, false, false);
    quick_check_impl(
        input,
        tables::CCC_QC_NFKD_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
        latin1_upper_safe,
    )
}
/// Reference variant of the chunked quick-check driver, compiled only with
/// the `quick_check_oracle` feature.
///
/// It follows the same chunk-then-tail structure as the main driver but has
/// no Latin-1 upper-half shortcut: there is no `latin1_upper_safe` parameter
/// and the scalar fallback is called with that flag set to `false`.
///
/// # Parameters
///
/// * `qc_shift` - forwarded to `tables::lookup_ccc_qc` together with each
///   character that needs a table lookup.
/// * `simd_bound` - byte threshold forwarded to `simd::scan_chunk`.
/// * `safe_below` - code points below this value skip the table lookup.
/// * `hangul_safe` - when set, U+AC00..=U+D7A3 skips the table lookup.
/// * `kana_safe` - when set, code points accepted by `is_kana` skip the
///   table lookup.
///
/// # Returns
///
/// `No` as soon as a looked-up character has a non-zero combining class that
/// is lower than the preceding one, or a quick-check value that decodes to
/// `No`. Otherwise `Maybe` if any looked-up character decoded to `Maybe`,
/// and `Yes` if none did.
#[cfg(feature = "quick_check_oracle")]
#[inline]
fn quick_check_impl_oracle(
    input: &str,
    qc_shift: u32,
    simd_bound: u8,
    safe_below: u32,
    hangul_safe: bool,
    kana_safe: bool,
) -> IsNormalized {
    const CHUNK_LEN: usize = 64;

    let bytes = input.as_bytes();
    let len = bytes.len();
    if len < CHUNK_LEN {
        return quick_check_scalar(input, qc_shift, safe_below, hangul_safe, kana_safe, false);
    }

    // Cheap range tests that let a character bypass the table lookup. Shared
    // by the chunk loop and the tail loop so both honour every flag.
    let is_range_safe = |cp: u32| -> bool {
        cp < safe_below
            || is_cjk_unified(cp)
            || (hangul_safe && (0xAC00..=0xD7A3).contains(&cp))
            || (kana_safe && is_kana(cp))
            || (cp >= 0x10000 && is_supp_safe(cp))
    };

    let ptr = bytes.as_ptr();
    let mut last_ccc: u8 = 0;
    let mut result = IsNormalized::Yes;
    // First byte offset that has not yet been accounted for. It can run up to
    // three bytes past a chunk boundary when a character straddles it.
    let mut processed_up_to: usize = 0;
    let mut pos: usize = 0;

    while pos + CHUNK_LEN <= len {
        let chunk_end = pos + CHUNK_LEN;
        // SAFETY: the loop condition guarantees `pos + CHUNK_LEN <= len`, so
        // `ptr.add(pos)` stays inside `bytes` with at least `CHUNK_LEN`
        // readable bytes behind it.
        // NOTE(review): assumes `scan_chunk` reads at most `CHUNK_LEN` bytes
        // from the pointer -- confirm against the `simd` module.
        let mut mask = unsafe { simd::scan_chunk(ptr.add(pos), simd_bound) };
        if mask == 0 {
            // Nothing in this chunk needs a closer look.
            last_ccc = 0;
            processed_up_to = chunk_end;
            pos = chunk_end;
            continue;
        }

        while mask != 0 {
            let bit_pos = mask.trailing_zeros() as usize;
            // Clear the lowest set bit.
            mask &= mask.wrapping_sub(1);
            let byte_pos = pos + bit_pos;
            if byte_pos < processed_up_to {
                // Byte belongs to a character that was already decoded.
                continue;
            }
            if byte_pos > processed_up_to {
                // Unflagged bytes sit between the previous flagged character
                // and this one, so the combining-class run is broken.
                last_ccc = 0;
            }
            let (ch, width) = utf8::decode_char_at(bytes, byte_pos);
            processed_up_to = byte_pos + width;
            if is_range_safe(ch as u32) {
                last_ccc = 0;
                continue;
            }
            let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
            if ccc != 0 && last_ccc > ccc {
                return IsNormalized::No;
            }
            match qc_value_to_result(qc) {
                IsNormalized::No => return IsNormalized::No,
                IsNormalized::Maybe => result = IsNormalized::Maybe,
                IsNormalized::Yes => {}
            }
            last_ccc = ccc;
        }

        if processed_up_to < chunk_end {
            // Unflagged bytes trail the last flagged character of the chunk.
            last_ccc = 0;
            processed_up_to = chunk_end;
        }
        pos = chunk_end;
    }

    // Every loop iteration leaves `processed_up_to >= pos`, so the tail starts
    // exactly where the chunk loop stopped accounting for bytes.
    let mut tail_pos = processed_up_to;
    while tail_pos < len {
        let b = bytes[tail_pos];
        if b < 0x80 {
            last_ccc = 0;
            tail_pos += 1;
            continue;
        }
        if utf8::is_continuation_byte(b) {
            tail_pos += 1;
            continue;
        }
        let (ch, width) = utf8::decode_char_at(bytes, tail_pos);
        tail_pos += width;
        if is_range_safe(ch as u32) {
            last_ccc = 0;
            continue;
        }
        let (ccc, qc) = tables::lookup_ccc_qc(ch, qc_shift);
        if ccc != 0 && last_ccc > ccc {
            return IsNormalized::No;
        }
        match qc_value_to_result(qc) {
            IsNormalized::No => return IsNormalized::No,
            IsNormalized::Maybe => result = IsNormalized::Maybe,
            IsNormalized::Yes => {}
        }
        last_ccc = ccc;
    }
    result
}
/// NFC quick check routed through `quick_check_impl_oracle`, with the Hangul
/// and kana shortcuts switched on.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfc_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xCC;
    const SAFE_BELOW: u32 = 0x0300;
    let (hangul_safe, kana_safe) = (true, true);
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFC_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
    )
}
/// NFD quick check routed through `quick_check_impl_oracle`, with the Hangul
/// and kana shortcuts switched off.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfd_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xC3;
    const SAFE_BELOW: u32 = 0x00C0;
    let (hangul_safe, kana_safe) = (false, false);
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFD_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
    )
}
/// NFKC quick check routed through `quick_check_impl_oracle`, with the Hangul
/// and kana shortcuts switched on.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkc_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    let (hangul_safe, kana_safe) = (true, true);
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFKC_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
    )
}
/// NFKD quick check routed through `quick_check_impl_oracle`, with the Hangul
/// and kana shortcuts switched off.
#[cfg(feature = "quick_check_oracle")]
pub fn quick_check_nfkd_oracle(input: &str) -> IsNormalized {
    const SIMD_BOUND: u8 = 0xC0;
    const SAFE_BELOW: u32 = 0x00A0;
    let (hangul_safe, kana_safe) = (false, false);
    quick_check_impl_oracle(
        input,
        tables::CCC_QC_NFKD_SHIFT,
        SIMD_BOUND,
        SAFE_BELOW,
        hangul_safe,
        kana_safe,
    )
}
/// Reports whether `input` is already in NFC.
///
/// The quick check settles the `Yes` and `No` verdicts directly; only a
/// `Maybe` verdict pays for a full NFC normalization, whose output is then
/// compared with `input`.
pub(crate) fn is_normalized_nfc(input: &str) -> bool {
    let verdict = quick_check_nfc(input);
    if verdict == IsNormalized::Maybe {
        &*crate::nfc().normalize(input) == input
    } else {
        verdict == IsNormalized::Yes
    }
}
/// Reports whether `input` is already in NFD.
///
/// The quick check settles the `Yes` and `No` verdicts directly; only a
/// `Maybe` verdict pays for a full NFD normalization, whose output is then
/// compared with `input`.
pub(crate) fn is_normalized_nfd(input: &str) -> bool {
    let verdict = quick_check_nfd(input);
    if verdict == IsNormalized::Maybe {
        &*crate::nfd().normalize(input) == input
    } else {
        verdict == IsNormalized::Yes
    }
}
/// Reports whether `input` is already in NFKC.
///
/// The quick check settles the `Yes` and `No` verdicts directly; only a
/// `Maybe` verdict pays for a full NFKC normalization, whose output is then
/// compared with `input`.
pub(crate) fn is_normalized_nfkc(input: &str) -> bool {
    let verdict = quick_check_nfkc(input);
    if verdict == IsNormalized::Maybe {
        &*crate::nfkc().normalize(input) == input
    } else {
        verdict == IsNormalized::Yes
    }
}
/// Reports whether `input` is already in NFKD.
///
/// The quick check settles the `Yes` and `No` verdicts directly; only a
/// `Maybe` verdict pays for a full NFKD normalization, whose output is then
/// compared with `input`.
pub(crate) fn is_normalized_nfkd(input: &str) -> bool {
    let verdict = quick_check_nfkd(input);
    if verdict == IsNormalized::Maybe {
        &*crate::nfkd().normalize(input) == input
    } else {
        verdict == IsNormalized::Yes
    }
}
/// Unit tests for the quick-check entry points and the `is_normalized_*`
/// helpers. Inputs shorter than 64 bytes take the scalar path; the longer
/// ones go through the chunked driver.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::format;
    use alloc::string::String;

    #[test]
    fn ascii_is_nfc() {
        assert_eq!(quick_check_nfc("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfd() {
        assert_eq!(quick_check_nfd("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfkc() {
        assert_eq!(quick_check_nfkc("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn ascii_is_nfkd() {
        assert_eq!(quick_check_nfkd("Hello, world!"), IsNormalized::Yes);
    }

    #[test]
    fn empty_string_is_normalized() {
        assert_eq!(quick_check_nfc(""), IsNormalized::Yes);
        assert_eq!(quick_check_nfd(""), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(""), IsNormalized::Yes);
        assert_eq!(quick_check_nfkd(""), IsNormalized::Yes);
    }

    #[test]
    fn precomposed_is_nfc_yes() {
        assert_eq!(quick_check_nfc("\u{00E9}"), IsNormalized::Yes);
    }

    #[test]
    fn decomposed_is_not_nfc() {
        let nfd = "e\u{0301}";
        let result = quick_check_nfc(nfd);
        assert!(
            result == IsNormalized::No || result == IsNormalized::Maybe,
            "NFD form must not be Yes for NFC, got {:?}",
            result,
        );
    }

    #[test]
    fn precomposed_is_not_nfd() {
        assert_eq!(quick_check_nfd("\u{00E9}"), IsNormalized::No);
    }

    #[test]
    fn wrong_ccc_order_is_no() {
        // U+0301 (ccc 230) followed by U+0327 (ccc 202) is out of canonical order.
        let bad_order = "a\u{0301}\u{0327}";
        assert_eq!(quick_check_nfc(bad_order), IsNormalized::No);
        assert_eq!(quick_check_nfd(bad_order), IsNormalized::No);
    }

    #[test]
    fn correct_ccc_order_not_rejected() {
        // U+0591 (ccc 220) followed by U+05A1 (ccc 230) is in canonical order.
        let good_order = "a\u{0591}\u{05A1}";
        let result = quick_check_nfc(good_order);
        assert_ne!(result, IsNormalized::No);
    }

    #[test]
    fn latin1_supplement_is_nfc() {
        let latin1 = "\u{00C0}\u{00E9}\u{00F6}\u{00FC}\u{00FF}";
        assert_eq!(quick_check_nfc(latin1), IsNormalized::Yes);
    }

    #[test]
    fn latin_extended_is_nfc() {
        let extended = "\u{0100}\u{017E}\u{0250}\u{02FF}";
        assert_eq!(quick_check_nfc(extended), IsNormalized::Yes);
    }

    #[test]
    fn cjk_is_nfc() {
        let cjk = "\u{4E00}\u{9FFF}\u{3400}\u{4DBF}";
        assert_eq!(quick_check_nfc(cjk), IsNormalized::Yes);
    }

    #[test]
    fn hangul_syllable_is_nfc() {
        let hangul = "\u{AC00}\u{D7A3}";
        assert_eq!(quick_check_nfc(hangul), IsNormalized::Yes);
    }

    #[test]
    fn hangul_syllable_is_not_nfd() {
        let hangul = "\u{AC00}";
        assert_eq!(quick_check_nfd(hangul), IsNormalized::No);
    }

    #[test]
    fn kana_is_nfc_and_nfkc() {
        // Every character here is covered by the kana range shortcut, which
        // the NFC and NFKC entry points enable.
        let kana = "\u{3042}\u{309D}\u{30A2}\u{30FC}";
        assert_eq!(quick_check_nfc(kana), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(kana), IsNormalized::Yes);
    }

    #[test]
    fn kana_long_input_is_nfc_and_nfkc() {
        // 22 three-byte characters: 66 bytes, so the last character straddles
        // the end of the first 64-byte chunk.
        let s: String = "\u{3042}".repeat(22);
        assert_eq!(s.len(), 66);
        assert_eq!(quick_check_nfc(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(&s), IsNormalized::Yes);
    }

    #[test]
    fn supplementary_emoji_is_normalized_in_all_forms() {
        // Both code points fall inside the supplementary-plane range shortcut,
        // which applies regardless of the normalization form.
        let emoji = "\u{1F600}\u{1F680}";
        assert_eq!(quick_check_nfc(emoji), IsNormalized::Yes);
        assert_eq!(quick_check_nfd(emoji), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(emoji), IsNormalized::Yes);
        assert_eq!(quick_check_nfkd(emoji), IsNormalized::Yes);
    }

    #[test]
    fn latin1_is_not_nfd() {
        assert_eq!(quick_check_nfd("\u{00C0}"), IsNormalized::No);
    }

    #[test]
    fn nbsp_is_not_nfkc() {
        assert_eq!(quick_check_nfkc("\u{00A0}"), IsNormalized::No);
    }

    #[test]
    fn is_normalized_nfc_ascii() {
        assert!(is_normalized_nfc("Hello"));
    }

    #[test]
    fn is_normalized_nfc_precomposed() {
        assert!(is_normalized_nfc("\u{00E9}"));
    }

    #[test]
    fn is_normalized_nfd_decomposed() {
        assert!(is_normalized_nfd("e\u{0301}"));
    }

    #[test]
    fn is_normalized_nfc_rejects_nfd() {
        assert!(!is_normalized_nfc("e\u{0301}"));
    }

    #[test]
    fn is_normalized_nfd_rejects_nfc() {
        assert!(!is_normalized_nfd("\u{00E9}"));
    }

    #[test]
    fn safe_lead_interleaved_with_combining_marks_across_chunk() {
        // Each unit is 8 bytes (3 + 1 + 2 + 1 + 1), so 16 repetitions fill
        // exactly two 64-byte chunks.
        let unit = "\u{4E2D}a\u{0591}bb";
        let s: String = unit.repeat(16);
        assert_eq!(s.len(), 128);
        assert_eq!(quick_check_nfc(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfd(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkc(&s), IsNormalized::Yes);
        assert_eq!(quick_check_nfkd(&s), IsNormalized::Yes);
    }

    #[test]
    fn safe_lead_then_out_of_order_combining_is_no() {
        // 64 ASCII bytes fill the first chunk, leaving the offending sequence
        // to the bytes after the last full chunk.
        let unit = "\u{4E2D}a\u{0301}\u{0327}";
        let padding = "x".repeat(64);
        let s = format!("{}{}", padding, unit);
        assert!(s.len() >= 64);
        assert_eq!(quick_check_nfc(&s), IsNormalized::No);
    }

    #[cfg(feature = "quick_check_oracle")]
    #[test]
    fn oracle_matches_fastpath_on_fixed_input() {
        let s = "\u{4E2D}a\u{0591}bb".repeat(16);
        assert_eq!(quick_check_nfc(&s), super::quick_check_nfc_oracle(&s));
        assert_eq!(quick_check_nfd(&s), super::quick_check_nfd_oracle(&s));
        assert_eq!(quick_check_nfkc(&s), super::quick_check_nfkc_oracle(&s));
        assert_eq!(quick_check_nfkd(&s), super::quick_check_nfkd_oracle(&s));
    }
}