extern crate alloc;
use alloc::borrow::Cow;
use alloc::string::String;
use simd_normalizer::normalizer::{NfcNormalizer, NfdNormalizer, NfkcNormalizer, NfkdNormalizer};
/// Reference NFC result for `s`, produced by `icu_normalizer` and returned as
/// an owned `String`.
fn icu_nfc(s: &str) -> String {
    use icu_normalizer::ComposingNormalizerBorrowed;

    let reference = ComposingNormalizerBorrowed::new_nfc();
    let normalized = reference.normalize(s);
    normalized.into_owned()
}
/// Reference NFD result for `s`, produced by `icu_normalizer` and returned as
/// an owned `String`.
fn icu_nfd(s: &str) -> String {
    use icu_normalizer::DecomposingNormalizerBorrowed;

    let reference = DecomposingNormalizerBorrowed::new_nfd();
    let normalized = reference.normalize(s);
    normalized.into_owned()
}
/// Reference NFKC result for `s`, produced by `icu_normalizer` and returned as
/// an owned `String`.
fn icu_nfkc(s: &str) -> String {
    use icu_normalizer::ComposingNormalizerBorrowed;

    let reference = ComposingNormalizerBorrowed::new_nfkc();
    let normalized = reference.normalize(s);
    normalized.into_owned()
}
/// Reference NFKD result for `s`, produced by `icu_normalizer` and returned as
/// an owned `String`.
fn icu_nfkd(s: &str) -> String {
    use icu_normalizer::DecomposingNormalizerBorrowed;

    let reference = DecomposingNormalizerBorrowed::new_nfkd();
    let normalized = reference.normalize(s);
    normalized.into_owned()
}
/// Returns `true` when `cow` is the `Cow::Borrowed` variant and `false` when
/// it is `Cow::Owned`.
// The parameter has to stay `&Cow` (not `&str`): the variant is exactly what
// is being inspected, hence the lint allowance.
#[allow(clippy::ptr_arg)]
fn is_borrowed(cow: &Cow<'_, str>) -> bool {
    match cow {
        Cow::Borrowed(_) => true,
        Cow::Owned(_) => false,
    }
}
/// Normalizes `input` to NFC, NFD, NFKC and NFKD with this crate's normalizers
/// and with the ICU reference helpers, then asserts that each pair of results
/// is equal.
///
/// # Panics
///
/// Panics at the first form (checked in the order NFC, NFD, NFKC, NFKD) whose
/// output differs from the ICU output.
fn assert_all_forms_match_icu(input: &str) {
    // All eight results are computed up front, before any comparison runs.
    let ours_nfc = NfcNormalizer.normalize(input);
    let ours_nfd = NfdNormalizer.normalize(input);
    let ours_nfkc = NfkcNormalizer.normalize(input);
    let ours_nfkd = NfkdNormalizer.normalize(input);

    let theirs_nfc = icu_nfc(input);
    let theirs_nfd = icu_nfd(input);
    let theirs_nfkc = icu_nfkc(input);
    let theirs_nfkd = icu_nfkd(input);

    assert_eq!(
        &*ours_nfc,
        theirs_nfc.as_str(),
        "NFC mismatch for input of {} bytes",
        input.len()
    );
    assert_eq!(
        &*ours_nfd,
        theirs_nfd.as_str(),
        "NFD mismatch for input of {} bytes",
        input.len()
    );
    assert_eq!(
        &*ours_nfkc,
        theirs_nfkc.as_str(),
        "NFKC mismatch for input of {} bytes",
        input.len()
    );
    assert_eq!(
        &*ours_nfkd,
        theirs_nfkd.as_str(),
        "NFKD mismatch for input of {} bytes",
        input.len()
    );
}
/// Builds an ASCII string of exactly `n` bytes by cycling through a fixed
/// 64-character alphabet (`a-z`, `0-9`, `A-Z`, `_`, `-`).
///
/// # Panics
///
/// Panics if the produced string is not `n` bytes long (an internal sanity
/// check; every alphabet character is a single byte).
fn ascii_bytes(n: usize) -> String {
    const ALPHABET: &[u8] = b"abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_-";

    let mut out = String::with_capacity(n);
    out.extend(ALPHABET.iter().cycle().take(n).map(|&byte| char::from(byte)));
    assert_eq!(out.len(), n);
    out
}
/// Pure-ASCII input of exactly 64 bytes: every form must return a borrowed
/// `Cow` equal to the input and must agree with ICU.
#[test]
fn exactly_64_ascii_bytes_all_forms() {
    let text = ascii_bytes(64);
    assert_eq!(text.len(), 64);

    // Tuple elements are evaluated left to right: NFC, NFD, NFKC, NFKD.
    let (nfc, nfd, nfkc, nfkd) = (
        NfcNormalizer.normalize(&text),
        NfdNormalizer.normalize(&text),
        NfkcNormalizer.normalize(&text),
        NfkdNormalizer.normalize(&text),
    );

    assert!(is_borrowed(&nfc), "NFC should borrow 64-byte ASCII");
    assert!(is_borrowed(&nfd), "NFD should borrow 64-byte ASCII");
    assert!(is_borrowed(&nfkc), "NFKC should borrow 64-byte ASCII");
    assert!(is_borrowed(&nfkd), "NFKD should borrow 64-byte ASCII");

    let expected = text.as_str();
    assert_eq!(&*nfc, expected);
    assert_eq!(&*nfd, expected);
    assert_eq!(&*nfkc, expected);
    assert_eq!(&*nfkd, expected);

    assert_all_forms_match_icu(&text);
}
/// Pure-ASCII input of exactly 128 bytes: every form must return a borrowed
/// `Cow` equal to the input and must agree with ICU.
#[test]
fn exactly_128_ascii_bytes_all_forms() {
    let text = ascii_bytes(128);
    assert_eq!(text.len(), 128);

    // Tuple elements are evaluated left to right: NFC, NFD, NFKC, NFKD.
    let (nfc, nfd, nfkc, nfkd) = (
        NfcNormalizer.normalize(&text),
        NfdNormalizer.normalize(&text),
        NfkcNormalizer.normalize(&text),
        NfkdNormalizer.normalize(&text),
    );

    assert!(is_borrowed(&nfc), "NFC should borrow 128-byte ASCII");
    assert!(is_borrowed(&nfd), "NFD should borrow 128-byte ASCII");
    assert!(is_borrowed(&nfkc), "NFKC should borrow 128-byte ASCII");
    assert!(is_borrowed(&nfkd), "NFKD should borrow 128-byte ASCII");

    let expected = text.as_str();
    assert_eq!(&*nfc, expected);
    assert_eq!(&*nfd, expected);
    assert_eq!(&*nfkc, expected);
    assert_eq!(&*nfkd, expected);

    assert_all_forms_match_icu(&text);
}
/// 63 ASCII bytes followed by U+00E9 (2 bytes in UTF-8), so the character
/// occupies byte offsets 63 and 64.
#[test]
fn ascii_63_plus_two_byte_char_straddling_64() {
    let text = format!("{}\u{00E9}", ascii_bytes(63));
    assert_eq!(text.len(), 65, "63 ASCII + 2-byte char = 65 bytes");
    assert_all_forms_match_icu(&text);
}
/// 63 ASCII bytes followed by precomposed U+00FC at offsets 63..=64: NFC must
/// reproduce the input unchanged, and all forms must agree with ICU.
#[test]
fn ascii_63_plus_two_byte_char_nfc_stable() {
    let text = format!("{}\u{00FC}", ascii_bytes(63));
    assert_eq!(text.len(), 65);

    let composed = NfcNormalizer.normalize(&text);
    assert_eq!(
        &*composed,
        text.as_str(),
        "NFC should be identity for precomposed char"
    );

    assert_all_forms_match_icu(&text);
}
/// 64 ASCII bytes followed by a single 2-, 3- or 4-byte character that starts
/// exactly at byte offset 64.
#[test]
fn ascii_64_plus_non_ascii_trailing() {
    let ascii = ascii_bytes(64);

    // 2-byte tail: U+00E9.
    let two_byte_tail = format!("{}\u{00E9}", ascii);
    assert_eq!(two_byte_tail.len(), 66);
    assert_all_forms_match_icu(&two_byte_tail);

    // 3-byte tail: U+4E00.
    let three_byte_tail = format!("{}\u{4E00}", ascii);
    assert_eq!(three_byte_tail.len(), 67);
    assert_all_forms_match_icu(&three_byte_tail);

    // 4-byte tail: U+1F600.
    let four_byte_tail = format!("{}\u{1F600}", ascii);
    assert_eq!(four_byte_tail.len(), 68);
    assert_all_forms_match_icu(&four_byte_tail);
}
/// 64 ASCII bytes followed by `A` + U+0300 + U+0301, i.e. a combining
/// sequence that begins exactly at byte offset 64.
#[test]
fn ascii_64_plus_combining_sequence_trailing() {
    let text = format!("{}A\u{0300}\u{0301}", ascii_bytes(64));
    assert_eq!(text.len(), 69);
    assert_all_forms_match_icu(&text);
}
/// U+4E00 (3 bytes in UTF-8) after 62 ASCII bytes, so it occupies byte
/// offsets 62..=64.
#[test]
fn cjk_3byte_straddling_64_boundary() {
    let text = format!("{}\u{4E00}", ascii_bytes(62));
    assert_eq!(text.len(), 65, "62 ASCII + 3-byte CJK = 65 bytes");
    assert_all_forms_match_icu(&text);
}
/// U+1F600 (4 bytes in UTF-8) after 61 ASCII bytes, so it occupies byte
/// offsets 61..=64.
#[test]
fn emoji_4byte_straddling_64_boundary() {
    let text = format!("{}\u{1F600}", ascii_bytes(61));
    assert_eq!(text.len(), 65, "61 ASCII + 4-byte emoji = 65 bytes");
    assert_all_forms_match_icu(&text);
}
/// `A` at offset 60 followed by U+0300 (offsets 61..=62) and U+0301
/// (offsets 63..=64).
#[test]
fn combining_sequence_starting_at_byte_60() {
    let text = format!("{}A\u{0300}\u{0301}", ascii_bytes(60));
    assert_eq!(text.len(), 65, "60 + 1 + 2 + 2 = 65 bytes");
    assert_all_forms_match_icu(&text);
}
/// `A` at offset 62 followed by U+0300 at offsets 63..=64.
#[test]
fn combining_sequence_starting_at_byte_62() {
    let text = format!("{}A\u{0300}", ascii_bytes(62));
    assert_eq!(text.len(), 65, "62 + 1 + 2 = 65 bytes");
    assert_all_forms_match_icu(&text);
}
/// U+4E00 (3 bytes in UTF-8) after 126 ASCII bytes, so it occupies byte
/// offsets 126..=128.
#[test]
fn cjk_3byte_straddling_128_boundary() {
    let text = format!("{}\u{4E00}", ascii_bytes(126));
    assert_eq!(text.len(), 129, "126 ASCII + 3-byte CJK = 129 bytes");
    assert_all_forms_match_icu(&text);
}
/// U+1F600 (4 bytes in UTF-8) after 125 ASCII bytes, so it occupies byte
/// offsets 125..=128.
#[test]
fn emoji_4byte_straddling_128_boundary() {
    let text = format!("{}\u{1F600}", ascii_bytes(125));
    assert_eq!(text.len(), 129, "125 ASCII + 4-byte emoji = 129 bytes");
    assert_all_forms_match_icu(&text);
}
/// U+00E9 (2 bytes in UTF-8) after 127 ASCII bytes, so it occupies byte
/// offsets 127..=128.
#[test]
fn two_byte_char_straddling_128_boundary() {
    let text = format!("{}\u{00E9}", ascii_bytes(127));
    assert_eq!(text.len(), 129, "127 ASCII + 2-byte char = 129 bytes");
    assert_all_forms_match_icu(&text);
}
/// `A` at offset 126 followed by U+0300 at offsets 127..=128.
#[test]
fn combining_sequence_straddling_128_boundary() {
    let text = format!("{}A\u{0300}", ascii_bytes(126));
    assert_eq!(text.len(), 129, "126 + 1 + 2 = 129 bytes");
    assert_all_forms_match_icu(&text);
}
/// Precomposed U+00C5 as the last two bytes of a 64-byte input. Its canonical
/// decomposition (`A` + U+030A) is three bytes, so decomposed output is longer
/// than 64 bytes.
#[test]
fn a_ring_decomposes_across_64_boundary() {
    let text = format!("{}\u{00C5}", ascii_bytes(62));
    assert_eq!(text.len(), 64, "62 ASCII + 2-byte A-ring = 64 bytes");
    assert_all_forms_match_icu(&text);
}
/// U+00C5 at offsets 62..=63, then U+0301 and U+0327 starting at offset 64.
#[test]
fn a_ring_plus_combining_marks_in_next_chunk() {
    let text = format!("{}\u{00C5}\u{0301}\u{0327}", ascii_bytes(62));
    assert_eq!(text.len(), 68, "62 + 2 + 2 + 2 = 68 bytes");
    assert_all_forms_match_icu(&text);
}
/// Starter `A` at offset 63 with both of its combining marks (U+0300, U+0301)
/// at offsets 64 and above.
#[test]
fn starter_before_64_combining_after_64() {
    let text = format!("{}A\u{0300}\u{0301}", ascii_bytes(63));
    assert_eq!(text.len(), 68, "63 + 1 + 2 + 2 = 68 bytes");
    assert_all_forms_match_icu(&text);
}
/// Starter `o` at offset 63 followed by three combining marks (U+0303,
/// U+0304, U+0323) occupying offsets 64..=69.
#[test]
fn starter_at_byte_63_long_combining_run() {
    let text = format!("{}o\u{0303}\u{0304}\u{0323}", ascii_bytes(63));
    assert_eq!(text.len(), 70, "63 + 1 + 2 + 2 + 2 = 70 bytes");
    assert_all_forms_match_icu(&text);
}
/// U+01FA as the last two bytes of a 64-byte input. The input itself has no
/// combining marks; they only appear once the character is decomposed
/// (`A` + U+030A + U+0301).
#[test]
fn decomposing_char_at_boundary_with_trailing_combiners() {
    let text = format!("{}\u{01FA}", ascii_bytes(62));
    assert_eq!(text.len(), 64, "62 + 2 = 64 bytes");
    assert_all_forms_match_icu(&text);
}
/// Hangul syllable U+AC00 (3 bytes in UTF-8) after 62 ASCII bytes, so it
/// occupies byte offsets 62..=64.
#[test]
fn hangul_syllable_straddling_64_boundary() {
    let text = format!("{}\u{AC00}", ascii_bytes(62));
    assert_eq!(text.len(), 65, "62 + 3 = 65 bytes");
    assert_all_forms_match_icu(&text);
}
/// Repeats a 63-byte cycle (50 ASCII bytes, three 3-byte CJK characters, one
/// 4-byte emoji) until the input is at least 1 MiB, then compares all four
/// forms against ICU.
#[test]
fn large_1mb_repeating_pattern() {
    const TARGET_SIZE: usize = 1024 * 1024;

    let ascii_part = ascii_bytes(50);
    let cycle = format!("{}{}{}", ascii_part, "日本語", "\u{1F600}");
    assert_eq!(cycle.len(), 63, "50 + 9 + 4 = 63 bytes per cycle");

    // One extra repetition guarantees the target size is reached.
    let repeats = TARGET_SIZE / cycle.len() + 1;
    let big: String = cycle.repeat(repeats);
    assert!(
        big.len() >= TARGET_SIZE,
        "input should be >= 1MB, got {} bytes",
        big.len()
    );

    let ours_nfc = NfcNormalizer.normalize(&big);
    let ours_nfd = NfdNormalizer.normalize(&big);
    let ours_nfkc = NfkcNormalizer.normalize(&big);
    let ours_nfkd = NfkdNormalizer.normalize(&big);

    let theirs_nfc = icu_nfc(&big);
    let theirs_nfd = icu_nfd(&big);
    let theirs_nfkc = icu_nfkc(&big);
    let theirs_nfkd = icu_nfkd(&big);

    assert_eq!(
        &*ours_nfc,
        theirs_nfc.as_str(),
        "NFC mismatch on 1MB input ({} bytes)",
        big.len()
    );
    assert_eq!(
        &*ours_nfd,
        theirs_nfd.as_str(),
        "NFD mismatch on 1MB input ({} bytes)",
        big.len()
    );
    assert_eq!(
        &*ours_nfkc,
        theirs_nfkc.as_str(),
        "NFKC mismatch on 1MB input ({} bytes)",
        big.len()
    );
    assert_eq!(
        &*ours_nfkd,
        theirs_nfkd.as_str(),
        "NFKD mismatch on 1MB input ({} bytes)",
        big.len()
    );
}
/// Sweeps pure-ASCII inputs of 60 through 68 bytes. For each length every
/// form must return a borrowed `Cow` whose contents equal the input, and all
/// forms must agree with ICU.
#[test]
fn boundary_sweep_ascii_around_64() {
    for len in 60..=68 {
        let input = ascii_bytes(len);

        let nfc = NfcNormalizer.normalize(&input);
        let nfd = NfdNormalizer.normalize(&input);
        let nfkc = NfkcNormalizer.normalize(&input);
        let nfkd = NfkdNormalizer.normalize(&input);

        assert!(is_borrowed(&nfc), "NFC should borrow {}-byte ASCII", len);
        assert!(is_borrowed(&nfd), "NFD should borrow {}-byte ASCII", len);
        assert!(is_borrowed(&nfkc), "NFKC should borrow {}-byte ASCII", len);
        assert!(is_borrowed(&nfkd), "NFKD should borrow {}-byte ASCII", len);

        // `Borrowed` alone does not prove the borrow covers the whole input,
        // so the contents are checked for every form, not only NFC.
        assert_eq!(&*nfc, &input);
        assert_eq!(&*nfd, &input, "NFD should be identity for {}-byte ASCII", len);
        assert_eq!(&*nfkc, &input, "NFKC should be identity for {}-byte ASCII", len);
        assert_eq!(&*nfkd, &input, "NFKD should be identity for {}-byte ASCII", len);

        assert_all_forms_match_icu(&input);
    }
}
/// Sweeps pure-ASCII inputs of 124 through 132 bytes. For each length every
/// form must return a borrowed `Cow` whose contents equal the input, and all
/// forms must agree with ICU.
#[test]
fn boundary_sweep_ascii_around_128() {
    for len in 124..=132 {
        let input = ascii_bytes(len);

        let nfc = NfcNormalizer.normalize(&input);
        let nfd = NfdNormalizer.normalize(&input);
        let nfkc = NfkcNormalizer.normalize(&input);
        let nfkd = NfkdNormalizer.normalize(&input);

        assert!(is_borrowed(&nfc), "NFC should borrow {}-byte ASCII", len);
        assert!(is_borrowed(&nfd), "NFD should borrow {}-byte ASCII", len);
        assert!(is_borrowed(&nfkc), "NFKC should borrow {}-byte ASCII", len);
        assert!(is_borrowed(&nfkd), "NFKD should borrow {}-byte ASCII", len);

        // `Borrowed` alone does not prove the borrow covers the whole input,
        // so the contents are checked for every form, not only NFC.
        assert_eq!(&*nfc, &input);
        assert_eq!(&*nfd, &input, "NFD should be identity for {}-byte ASCII", len);
        assert_eq!(&*nfkc, &input, "NFKC should be identity for {}-byte ASCII", len);
        assert_eq!(&*nfkd, &input, "NFKD should be identity for {}-byte ASCII", len);

        assert_all_forms_match_icu(&input);
    }
}
/// For ASCII prefixes of 59 through 65 bytes, appends one 2-, 3- or 4-byte
/// character and checks every form against ICU.
#[test]
fn boundary_sweep_with_trailing_non_ascii_around_64() {
    for ascii_len in 59..=65 {
        // The prefix is deterministic, so it is built once per length.
        let prefix = ascii_bytes(ascii_len);

        let two_byte_tail = format!("{}\u{00E9}", prefix);
        assert_all_forms_match_icu(&two_byte_tail);

        let three_byte_tail = format!("{}\u{4E00}", prefix);
        assert_all_forms_match_icu(&three_byte_tail);

        let four_byte_tail = format!("{}\u{1F600}", prefix);
        assert_all_forms_match_icu(&four_byte_tail);
    }
}
/// For ASCII prefixes of 123 through 129 bytes, appends one 2-, 3- or 4-byte
/// character and checks every form against ICU.
#[test]
fn boundary_sweep_with_trailing_non_ascii_around_128() {
    for ascii_len in 123..=129 {
        // The prefix is deterministic, so it is built once per length.
        let prefix = ascii_bytes(ascii_len);

        let two_byte_tail = format!("{}\u{00E9}", prefix);
        assert_all_forms_match_icu(&two_byte_tail);

        let three_byte_tail = format!("{}\u{4E00}", prefix);
        assert_all_forms_match_icu(&three_byte_tail);

        let four_byte_tail = format!("{}\u{1F600}", prefix);
        assert_all_forms_match_icu(&four_byte_tail);
    }
}
/// Inputs of exactly 64 bytes whose final character is 2, 3 or 4 bytes wide,
/// so the multi-byte character ends on the last byte.
#[test]
fn exact_64_bytes_ending_with_multibyte() {
    // 62 ASCII + 2-byte U+00E9.
    let ends_with_2 = format!("{}\u{00E9}", ascii_bytes(62));
    assert_eq!(ends_with_2.len(), 64);
    assert_all_forms_match_icu(&ends_with_2);

    // 61 ASCII + 3-byte U+4E00.
    let ends_with_3 = format!("{}\u{4E00}", ascii_bytes(61));
    assert_eq!(ends_with_3.len(), 64);
    assert_all_forms_match_icu(&ends_with_3);

    // 60 ASCII + 4-byte U+1F600.
    let ends_with_4 = format!("{}\u{1F600}", ascii_bytes(60));
    assert_eq!(ends_with_4.len(), 64);
    assert_all_forms_match_icu(&ends_with_4);
}
/// Inputs of exactly 128 bytes whose final character is 2, 3 or 4 bytes wide,
/// so the multi-byte character ends on the last byte.
#[test]
fn exact_128_bytes_ending_with_multibyte() {
    // 126 ASCII + 2-byte U+00E9.
    let ends_with_2 = format!("{}\u{00E9}", ascii_bytes(126));
    assert_eq!(ends_with_2.len(), 128);
    assert_all_forms_match_icu(&ends_with_2);

    // 125 ASCII + 3-byte U+4E00.
    let ends_with_3 = format!("{}\u{4E00}", ascii_bytes(125));
    assert_eq!(ends_with_3.len(), 128);
    assert_all_forms_match_icu(&ends_with_3);

    // 124 ASCII + 4-byte U+1F600.
    let ends_with_4 = format!("{}\u{1F600}", ascii_bytes(124));
    assert_eq!(ends_with_4.len(), 128);
    assert_all_forms_match_icu(&ends_with_4);
}
/// Base letter `e` at offset 63 with U+0301 starting at offset 64. Besides
/// matching ICU, NFC must compose the pair into U+00E9.
#[test]
fn combining_sequence_split_exactly_at_64() {
    let ascii = ascii_bytes(63);
    let text = format!("{}e\u{0301}", ascii);
    assert_eq!(text.len(), 66, "63 + 1 + 2 = 66 bytes");
    assert_all_forms_match_icu(&text);

    let composed = NfcNormalizer.normalize(&text);
    let expected = format!("{}\u{00E9}", ascii);
    assert_eq!(
        &*composed,
        expected.as_str(),
        "NFC should compose e + acute to e-acute"
    );
}
/// Base letter `e` at offset 127 with U+0301 starting at offset 128. Besides
/// matching ICU, NFC must compose the pair into U+00E9.
#[test]
fn combining_sequence_split_exactly_at_128() {
    let ascii = ascii_bytes(127);
    let text = format!("{}e\u{0301}", ascii);
    assert_eq!(text.len(), 130, "127 + 1 + 2 = 130 bytes");
    assert_all_forms_match_icu(&text);

    let composed = NfcNormalizer.normalize(&text);
    let expected = format!("{}\u{00E9}", ascii);
    assert_eq!(&*composed, expected.as_str());
}
/// `a` at offset 61 followed by U+0308 (offsets 62..=63), U+0304
/// (offsets 64..=65) and U+0323 (offsets 66..=67).
#[test]
fn multiple_combining_marks_crossing_boundary() {
    let text = format!("{}a\u{0308}\u{0304}\u{0323}", ascii_bytes(61));
    assert_eq!(text.len(), 68, "61 + 1 + 2 + 2 + 2 = 68 bytes");
    assert_all_forms_match_icu(&text);
}
/// Places U+00C0 (2 bytes) at each start offset from 60 through 67, pads the
/// input with `z` up to 70 bytes, and checks every form against ICU.
#[test]
fn non_ascii_at_every_position_near_64() {
    for pos in 60..=67 {
        let mut text = ascii_bytes(pos);
        text.push('\u{00C0}');

        // Each `z` is one byte, so an empty range means no padding is needed.
        for _ in text.len()..70 {
            text.push('z');
        }

        assert_all_forms_match_icu(&text);
    }
}
/// Places U+00C0 (2 bytes) at each start offset from 124 through 131, pads the
/// input with `z` up to 134 bytes, and checks every form against ICU.
#[test]
fn non_ascii_at_every_position_near_128() {
    for pos in 124..=131 {
        let mut text = ascii_bytes(pos);
        text.push('\u{00C0}');

        // Each `z` is one byte, so an empty range means no padding is needed.
        for _ in text.len()..134 {
            text.push('z');
        }

        assert_all_forms_match_icu(&text);
    }
}
/// Builds one input with a non-ASCII character near each of the 64-, 128- and
/// 192-byte offsets, separated by ASCII filler, and checks every form against
/// ICU.
#[test]
fn three_chunks_with_boundary_straddling() {
    let mut text = ascii_bytes(62);

    // U+00C5 (2 bytes) at offsets 62..=63, then `x` filler up to 126 bytes.
    text.push('\u{00C5}');
    for _ in text.len()..126 {
        text.push('x');
    }

    // U+4E00 (3 bytes) at offsets 126..=128, then `y` filler up to 189 bytes.
    text.push('\u{4E00}');
    for _ in text.len()..189 {
        text.push('y');
    }

    // U+1F600 (4 bytes) at offsets 189..=192, then a short ASCII tail.
    text.push('\u{1F600}');
    text.push_str("tail");

    assert_all_forms_match_icu(&text);
}
/// A 64-byte input (61 ASCII bytes + 3-byte U+4E00) that this test expects to
/// be unchanged by NFC and NFKC: both must return `Cow::Borrowed`, and the
/// borrowed text must equal the input.
#[test]
fn nfc_stable_input_at_boundary_returns_borrowed() {
    let input = format!("{}\u{4E00}", ascii_bytes(61));
    assert_eq!(input.len(), 64);

    let nfc = NfcNormalizer.normalize(&input);
    assert!(
        is_borrowed(&nfc),
        "NFC should return Borrowed for NFC-stable 64-byte input"
    );
    // `Borrowed` alone would also be satisfied by a truncated or offset
    // slice, so the contents are checked as well.
    assert_eq!(&*nfc, &input, "borrowed NFC result should equal the input");

    let nfkc = NfkcNormalizer.normalize(&input);
    assert!(
        is_borrowed(&nfkc),
        "NFKC should return Borrowed for NFKC-stable 64-byte input"
    );
    assert_eq!(&*nfkc, &input, "borrowed NFKC result should equal the input");
}
/// A 64-byte input (60 ASCII bytes + `A` + U+030A + `z`) that is already in
/// decomposed form: NFD must return `Cow::Borrowed`, and the borrowed text
/// must equal the input.
#[test]
fn nfd_stable_input_at_boundary_returns_borrowed() {
    let input = format!("{}A\u{030A}z", ascii_bytes(60));
    assert_eq!(input.len(), 64);

    let nfd = NfdNormalizer.normalize(&input);
    assert!(
        is_borrowed(&nfd),
        "NFD should return Borrowed for already-decomposed 64-byte input"
    );
    // `Borrowed` alone would also be satisfied by a truncated or offset
    // slice, so the contents are checked as well.
    assert_eq!(&*nfd, &input, "borrowed NFD result should equal the input");
}
/// Repeats a 61-byte cycle mixing ASCII, U+00E9, U+4E00, U+1F600 and a
/// trailing U+0301 until the input exceeds 1 MiB, then compares owned results
/// of all four forms against ICU.
#[test]
fn large_varied_pattern_1mb() {
    const TARGET_SIZE: usize = 1024 * 1024;

    let ascii_37 = ascii_bytes(37);
    let ascii_13 = ascii_bytes(13);
    let cycle = format!("{}\u{00E9}{}\u{4E00}\u{1F600}\u{0301}", ascii_37, ascii_13);
    assert_eq!(cycle.len(), 61);

    let repeats = TARGET_SIZE / cycle.len() + 1;
    let big: String = cycle.repeat(repeats);

    let ours_nfc = NfcNormalizer.normalize(&big).into_owned();
    let ours_nfd = NfdNormalizer.normalize(&big).into_owned();
    let ours_nfkc = NfkcNormalizer.normalize(&big).into_owned();
    let ours_nfkd = NfkdNormalizer.normalize(&big).into_owned();

    let theirs_nfc = icu_nfc(&big);
    let theirs_nfd = icu_nfd(&big);
    let theirs_nfkc = icu_nfkc(&big);
    let theirs_nfkd = icu_nfkd(&big);

    assert_eq!(ours_nfc, theirs_nfc, "NFC mismatch on large varied input");
    assert_eq!(ours_nfd, theirs_nfd, "NFD mismatch on large varied input");
    assert_eq!(ours_nfkc, theirs_nfkc, "NFKC mismatch on large varied input");
    assert_eq!(ours_nfkd, theirs_nfkd, "NFKD mismatch on large varied input");
}