#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
)
)]
#![warn(missing_docs)]
extern crate alloc;
// Alias for the u32-valued code point trie that backs the normalization data;
// the `icu4x_unstable_fast_trie_only` cfg swaps in a fast-mode-only trie type.
#[cfg(not(icu4x_unstable_fast_trie_only))]
type Trie<'trie> = CodePointTrie<'trie, u32>;
#[cfg(icu4x_unstable_fast_trie_only)]
type Trie<'trie> = FastCodePointTrie<'trie, u32>;
// Produces a `CanonicalCombiningClass` from a hard-coded ICU4C numeric value.
// When the `icu_properties` feature is enabled, a compile-time check verifies
// that the hard-coded number matches the named constant in `icu_properties`.
macro_rules! ccc {
    ($name:ident, $num:expr) => {
        // Evaluated in a `const` block so that a mismatch fails the build
        // rather than surfacing at run time.
        const {
            #[cfg(feature = "icu_properties")]
            if icu_properties::props::CanonicalCombiningClass::$name.to_icu4c_value() != $num {
                panic!("icu_normalizer has incorrect ccc values")
            }
            CanonicalCombiningClass::from_icu4c_value($num)
        }
    };
}
#[cfg(feature = "harfbuzz_traits")]
mod harfbuzz;
pub mod properties;
pub mod provider;
pub mod uts46;
use crate::provider::CanonicalCompositions;
use crate::provider::DecompositionData;
use crate::provider::NormalizerNfdDataV1;
use crate::provider::NormalizerNfkdDataV1;
use crate::provider::NormalizerUts46DataV1;
use alloc::borrow::Cow;
use alloc::string::String;
use core::char::REPLACEMENT_CHARACTER;
use icu_collections::char16trie::Char16Trie;
use icu_collections::char16trie::Char16TrieIterator;
use icu_collections::char16trie::TrieResult;
#[cfg(not(icu4x_unstable_fast_trie_only))]
use icu_collections::codepointtrie::CodePointTrie;
#[cfg(icu4x_unstable_fast_trie_only)]
use icu_collections::codepointtrie::FastCodePointTrie;
#[cfg(icu4x_unstable_fast_trie_only)]
use icu_collections::codepointtrie::TypedCodePointTrie;
#[cfg(feature = "icu_properties")]
use icu_properties::props::CanonicalCombiningClass;
use icu_provider::prelude::*;
use provider::DecompositionTables;
use provider::NormalizerNfcV1;
use provider::NormalizerNfdTablesV1;
use provider::NormalizerNfkdTablesV1;
use smallvec::SmallVec;
#[cfg(feature = "utf16_iter")]
use utf16_iter::Utf16CharsEx;
#[cfg(feature = "utf8_iter")]
use utf8_iter::Utf8CharsEx;
use zerovec::{zeroslice, ZeroSlice};
// Marker function for the `likely` hint below: calling a `#[cold]` function on
// the unlikely path nudges the optimizer's branch layout, while
// `#[inline(always)]` keeps the call itself free.
#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
#[cold]
fn cold_path() {}
/// Branch-prediction hint: returns `b` unchanged while signaling to the
/// optimizer that `b` is expected to be `true`, by routing the `false` case
/// through the `#[cold]` `cold_path` function.
#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
pub(crate) fn likely(b: bool) -> bool {
    if !b {
        // Unlikely case: touch the cold marker to bias codegen.
        cold_path();
    }
    b
}
// No-op fallback for `likely` when the fast-trie cfg is off: the hint is only
// meaningful in the fast-trie configuration.
#[cfg(all(not(icu4x_unstable_fast_trie_only), feature = "utf16_iter"))]
#[inline(always)]
fn likely(b: bool) -> bool {
    b
}
// Minimal stand-in for `icu_properties::props::CanonicalCombiningClass` used
// when the `icu_properties` feature is disabled; wraps the raw ICU4C ccc byte.
#[cfg(not(feature = "icu_properties"))]
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
struct CanonicalCombiningClass(pub(crate) u8);
#[cfg(not(feature = "icu_properties"))]
impl CanonicalCombiningClass {
    /// Wraps a raw ICU4C canonical combining class value.
    const fn from_icu4c_value(v: u8) -> Self {
        Self(v)
    }
    /// Returns the raw ICU4C canonical combining class value.
    const fn to_icu4c_value(self) -> u8 {
        self.0
    }
}
/// Canonical Combining Class 0 (Not_Reordered): starters and other
/// characters that do not take part in canonical reordering.
const CCC_NOT_REORDERED: CanonicalCombiningClass = ccc!(NotReordered, 0);
/// Canonical Combining Class 230 (Above).
const CCC_ABOVE: CanonicalCombiningClass = ccc!(Above, 230);
// How characters whose trie value is `IGNORABLE_MARKER` (default-ignorable
// code points in the UTS 46 data) are treated by the iterator.
#[derive(Debug, PartialEq, Eq)]
enum IgnorableBehavior {
    // The marker is not expected in this mode; hitting it is a data error
    // (debug-asserted, the character then flows through).
    Unsupported,
    // The character is silently dropped from the output.
    Ignored,
    // The character is replaced with U+FFFD REPLACEMENT CHARACTER.
    ReplacementCharacter,
}
/// Trie value reserved for default-ignorable code points (UTS 46 data).
const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;
/// Trie-value marker bit: the character does not round-trip through this
/// normalization form.
const NON_ROUND_TRIP_MARKER: u32 = 1 << 30;
/// Trie-value marker bit: the character can combine backwards with a
/// preceding starter.
const BACKWARD_COMBINING_MARKER: u32 = 1 << 31;
/// Mask of the "high" payload bits (bits 16..=29) of a trie value.
const HIGH_ZEROS_MASK: u32 = 0x3FFF0000;
/// Mask of the "low" payload bits (bits 5..=15) of a trie value.
const LOW_ZEROS_MASK: u32 = 0xFFE0;
/// Whether a trie value carries a canonical combining class, i.e. the
/// character is a non-starter. Such values store the ccc in the low byte
/// under a 0xD8/0xD9 tag in the second byte; the two topmost marker bits
/// are ignored by the mask.
#[inline]
fn trie_value_has_ccc(trie_value: u32) -> bool {
    let tagged = trie_value & 0x3FFF_FE00;
    tagged == 0xD800
}
/// Whether a trie value denotes a non-starter whose decomposition is
/// special-cased in code (U+0340, U+0344, U+0F73, etc.): these carry the
/// 0xD9 tag byte instead of plain 0xD8.
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
    let tag = trie_value & 0x3FFF_FF00;
    tag == 0xD900
}
/// Whether the decomposition of the character with this trie value starts
/// with a non-starter (exactly the values that carry a ccc).
fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
    trie_value_has_ccc(trie_value)
}
/// Extracts the canonical combining class from a trie value: the low byte
/// for non-starter values, otherwise ccc 0 (Not_Reordered).
fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
    if trie_value_has_ccc(trie_value) {
        CanonicalCombiningClass::from_icu4c_value(trie_value as u8)
    } else {
        CCC_NOT_REORDERED
    }
}
// The NFKD expansion of U+FDFA (ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM)
// minus its first character, which is emitted separately; see the
// `FDFA_MARKER` handling in `decomposing_next`.
static FDFA_NFKD: [u16; 17] = [
    0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
    0x633, 0x644, 0x645,
];
/// Sentinel "singleton decomposition" value standing in for the 18-character
/// expansion of U+FDFA.
const FDFA_MARKER: u16 = 1;
// Hangul syllable (de)composition constants; see The Unicode Standard,
// "Conjoining Jamo Behavior".
const HANGUL_S_BASE: u32 = 0xAC00;
const HANGUL_L_BASE: u32 = 0x1100;
const HANGUL_V_BASE: u32 = 0x1161;
const HANGUL_T_BASE: u32 = 0x11A7;
const HANGUL_L_COUNT: u32 = 19;
const HANGUL_V_COUNT: u32 = 21;
const HANGUL_T_COUNT: u32 = 28;
// V_COUNT * T_COUNT: syllables per leading consonant.
const HANGUL_N_COUNT: u32 = 588;
// L_COUNT * N_COUNT: total number of precomposed Hangul syllables.
const HANGUL_S_COUNT: u32 = 11172;
// Exclusive upper bound used to range-check conjoining jamo in `compose`.
const HANGUL_JAMO_LIMIT: u32 = 0x1200;
/// Unwraps `opt`, substituting `default` on `None`. Garbage-in, garbage-out:
/// a `None` here means the normalization data was inconsistent, so debug
/// builds assert while release builds continue with the fallback value.
#[inline(always)]
fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
    match opt {
        Some(val) => val,
        None => {
            // Should be unreachable with valid data.
            debug_assert!(false);
            default
        }
    }
}
/// Converts a `u32` to a `char`, substituting U+FFFD (with a debug
/// assertion) when the value is not a valid scalar value; see
/// `unwrap_or_gigo`.
#[inline(always)]
fn char_from_u32(u: u32) -> char {
    unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER)
}
/// Converts a `u16` to a `char` with the same GIGO fallback (surrogate code
/// units map to U+FFFD).
#[inline(always)]
fn char_from_u16(u: u16) -> char {
    char_from_u32(u32::from(u))
}
// Shared empty slices used when no supplementary (e.g. NFKD) expansion
// tables are present, and as GIGO fallbacks.
const EMPTY_U16: &ZeroSlice<u16> = zeroslice![];
const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![];
/// Tests `start <= c && c <= end` with a single comparison by shifting the
/// range down to zero; callers must pass `start <= end`.
#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
    let offset = u32::from(c).wrapping_sub(u32::from(start));
    let span = u32::from(end) - u32::from(start);
    offset <= span
}
/// `u16` analogue of `in_inclusive_range`: one-comparison inclusive range
/// check; callers must pass `start <= end`.
#[inline(always)]
#[cfg(feature = "utf16_iter")]
fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
    let offset = u.wrapping_sub(start);
    offset <= (end - start)
}
/// Attempts to compose `starter` and `second` into a single character,
/// handling Hangul algorithmically first and falling back to the canonical
/// compositions trie otherwise.
#[inline]
fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    // Position of `second` relative to the first medial vowel (V) jamo.
    let v = u32::from(second).wrapping_sub(HANGUL_V_BASE);
    if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
        // `second` is outside the conjoining jamo block entirely, so this
        // cannot be Hangul composition: consult the trie.
        return compose_non_hangul(iter, starter, second);
    }
    if v < HANGUL_V_COUNT {
        // `second` is a V jamo: compose L + V into an LV syllable.
        let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
        if l < HANGUL_L_COUNT {
            let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
            // SAFETY: HANGUL_S_BASE + lv stays within the precomposed Hangul
            // syllable block, which consists of valid scalar values.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
        }
        return None;
    }
    if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') {
        // `second` is a trailing consonant (T) jamo: compose LV + T into LVT.
        let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
        if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
            // `starter` is an LV syllable (its T index is zero).
            let lvt = lv + (u32::from(second) - HANGUL_T_BASE);
            // SAFETY: lvt < HANGUL_S_COUNT, so the sum stays inside the
            // Hangul syllable block.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
        }
    }
    // `second` is in the jamo block but not a composable V or T case.
    None
}
/// Attempts to compose a (non-Hangul) pair using the canonical compositions
/// trie. Note the query order: the trie is keyed by the *second* character
/// first and then the starter.
fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    match iter.next(second) {
        TrieResult::NoMatch => None,
        TrieResult::NoValue => match iter.next(starter) {
            TrieResult::NoMatch => None,
            TrieResult::FinalValue(i) => {
                // The stored value is expected to be a valid scalar value;
                // anything else indicates corrupt data (GIGO).
                if let Some(c) = char::from_u32(i as u32) {
                    Some(c)
                } else {
                    debug_assert!(false);
                    None
                }
            }
            TrieResult::NoValue | TrieResult::Intermediate(_) => {
                // With valid data the second step either fails or yields a
                // final value; continuing further is a data error.
                debug_assert!(false);
                None
            }
        },
        TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => {
            // A lone second character must not carry a value of its own.
            debug_assert!(false);
            None
        }
    }
}
/// Whether the trie value denotes a starter that decomposes to itself: all
/// bits other than the two marker bits are zero.
#[inline(always)]
fn starter_and_decomposes_to_self_impl(trie_val: u32) -> bool {
    (trie_val & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0
}
/// Whether the character round-trips (may pass through unchanged) and
/// additionally cannot combine backwards with a preceding character, i.e.
/// neither marker bit is set.
#[inline(always)]
fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> bool {
    (trie_val & (NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER)) == 0
}
/// A `char` paired with its normalization trie value, so that one trie
/// lookup can be reused across several decisions.
#[derive(Debug, PartialEq, Eq)]
struct CharacterAndTrieValue {
    character: char,
    trie_val: u32,
}
impl CharacterAndTrieValue {
    /// Pairs a character with its (already looked up) trie value.
    #[inline(always)]
    pub fn new(c: char, trie_value: u32) -> Self {
        CharacterAndTrieValue {
            character: c,
            trie_val: trie_value,
        }
    }
    /// True if this is a starter that decomposes to itself (only the marker
    /// bits may be set in the trie value).
    #[inline(always)]
    pub fn starter_and_decomposes_to_self(&self) -> bool {
        starter_and_decomposes_to_self_impl(self.trie_val)
    }
    /// Like `starter_and_decomposes_to_self`, but `NON_ROUND_TRIP_MARKER`
    /// makes this return `false`; in UTF-8 mode that marker flags U+FFFD
    /// substitutions (see `delegate_next_no_pending`).
    #[inline(always)]
    #[cfg(feature = "utf8_iter")]
    pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool {
        // Intentionally does NOT mask out NON_ROUND_TRIP_MARKER.
        (self.trie_val & !BACKWARD_COMBINING_MARKER) == 0
    }
    /// True if this character can combine backwards with a preceding starter.
    #[inline(always)]
    pub fn can_combine_backwards(&self) -> bool {
        (self.trie_val & BACKWARD_COMBINING_MARKER) != 0
    }
    /// True if this character round-trips through the normalization form.
    #[inline(always)]
    pub fn potential_passthrough(&self) -> bool {
        (self.trie_val & NON_ROUND_TRIP_MARKER) == 0
    }
    /// True if this character round-trips and cannot combine backwards.
    #[inline(always)]
    pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
        potential_passthrough_and_cannot_combine_backwards_impl(self.trie_val)
    }
}
/// A `char` and its canonical combining class packed into a single `u32`:
/// bits 0..=23 hold the scalar value and bits 24..=31 hold the ccc. The ccc
/// byte 0xFF is a placeholder meaning "not looked up yet".
#[derive(Debug)]
struct CharacterAndClass(u32);
impl CharacterAndClass {
    /// Packs a character with a known combining class.
    pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
        CharacterAndClass(u32::from(c) | (u32::from(ccc.to_icu4c_value()) << 24))
    }
    /// Packs a character with the 0xFF "ccc unknown" placeholder.
    pub fn new_with_placeholder(c: char) -> Self {
        CharacterAndClass(u32::from(c) | ((0xFF) << 24))
    }
    /// Packs a character, deriving its ccc from its trie value.
    pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
        Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
    }
    /// Packs a known starter (ccc 0).
    pub fn new_starter(c: char) -> Self {
        CharacterAndClass(u32::from(c))
    }
    /// Unpacks the character.
    pub fn character(&self) -> char {
        // SAFETY: the low 24 bits always originate from a `char` in the
        // constructors above, so they form a valid scalar value.
        unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
    }
    /// Unpacks the combining class (may be the 0xFF placeholder).
    pub fn ccc(&self) -> CanonicalCombiningClass {
        CanonicalCombiningClass::from_icu4c_value((self.0 >> 24) as u8)
    }
    /// Unpacks both fields at once.
    pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
        (self.character(), self.ccc())
    }
    /// Resolves the 0xFF placeholder into the real ccc by consulting the
    /// trie; entries whose ccc is already known are left untouched.
    pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &Trie) {
        if self.0 >> 24 != 0xFF {
            return;
        }
        let scalar = self.0 & 0xFFFFFF;
        self.0 =
            ((ccc_from_trie_value(trie.get32_u32(scalar)).to_icu4c_value() as u32) << 24) | scalar;
    }
}
/// Sorts `slice` by canonical combining class after resolving any pending
/// placeholder classes via the trie. The sort must be *stable* so that
/// characters with equal ccc keep their original relative order, as
/// canonical reordering requires.
#[inline(always)]
fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &Trie) {
    if slice.len() < 2 {
        // Zero or one element: nothing can be reordered.
        return;
    }
    for cc in slice.iter_mut() {
        cc.set_ccc_from_trie_if_not_already_set(trie);
    }
    // `sort_by_key` is a stable sort.
    slice.sort_by_key(|cc| cc.ccc());
}
/// An iterator adaptor yielding the decomposition (canonical, or
/// compatibility when supplementary tables are present) of the characters
/// produced by `delegate`, with combining marks sorted by canonical
/// combining class.
#[derive(Debug)]
pub struct Decomposition<'data, I>
where
    I: Iterator<Item = char>,
{
    delegate: I,
    // Already-decomposed characters waiting to be yielded, and the current
    // read position within that buffer.
    buffer: SmallVec<[CharacterAndClass; 17]>,
    buffer_pos: usize,
    // The most recently read but not yet processed character, if any.
    pending: Option<CharacterAndTrieValue>,
    trie: &'data Trie<'data>,
    // Main expansion tables (BMP in u16, supplementary-plane in char) plus
    // the optional extra tables (e.g. NFKD); empty slices when absent.
    scalars16: &'data ZeroSlice<u16>,
    scalars24: &'data ZeroSlice<char>,
    supplementary_scalars16: &'data ZeroSlice<u16>,
    supplementary_scalars24: &'data ZeroSlice<char>,
    // Characters below this bound skip the trie lookup entirely.
    decomposition_passthrough_bound: u32,
    // UTS 46 handling of default-ignorable code points.
    ignorable_behavior: IgnorableBehavior,
}
impl<'data, I> Decomposition<'data, I>
where
I: Iterator<Item = char>,
{
/// Constructs a decomposing iterator without supplementary tables (plain
/// canonical decomposition). Hidden from docs; not intended as public API
/// surface.
#[doc(hidden)]
pub fn new(
    delegate: I,
    decompositions: &'data DecompositionData,
    tables: &'data DecompositionTables,
) -> Self {
    Self::new_with_supplements(
        delegate,
        decompositions,
        tables,
        None,
        // Below U+00C0 everything decomposes to itself under NFD.
        0xC0,
        IgnorableBehavior::Unsupported,
    )
}
/// Constructs a decomposing iterator, optionally with supplementary
/// (compatibility / UTS 46) tables. Callers pass a
/// `decomposition_passthrough_bound` of at most 0xC0.
fn new_with_supplements(
    delegate: I,
    decompositions: &'data DecompositionData,
    tables: &'data DecompositionTables,
    supplementary_tables: Option<&'data DecompositionTables>,
    decomposition_passthrough_bound: u8,
    ignorable_behavior: IgnorableBehavior,
) -> Self {
    let mut ret = Decomposition::<I> {
        delegate,
        buffer: SmallVec::new(),
        buffer_pos: 0,
        // Seed `pending` with U+FFFF and trie value 0 (a starter that
        // decomposes to self) so the priming `next()` call below works.
        pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)),
        // A wrong trie type in the data is a data-generation bug, so the
        // panic via `expect` is intended.
        #[allow(clippy::useless_conversion, clippy::expect_used)]
        trie: <&Trie>::try_from(&decompositions.trie).expect("Unexpected trie type in data"),
        scalars16: &tables.scalars16,
        scalars24: &tables.scalars24,
        supplementary_scalars16: if let Some(supplementary) = supplementary_tables {
            &supplementary.scalars16
        } else {
            EMPTY_U16
        },
        supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
            &supplementary.scalars24
        } else {
            EMPTY_CHAR
        },
        decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
        ignorable_behavior,
    };
    // Prime the iterator: consume the U+FFFF placeholder so that `pending`
    // holds the first real character (if any).
    let _ = ret.next();
    ret
}
/// Pushes the trailing characters of a decomposition stored in a 16-bit
/// expansion table at `offset`/`len` into `self.buffer`, returning the
/// leading character and the buffer index where the trailing run of
/// non-starters begins.
fn push_decomposition16(
    &mut self,
    offset: usize,
    len: usize,
    only_non_starters_in_trail: bool,
    slice16: &ZeroSlice<u16>,
) -> (char, usize) {
    let (starter, tail) = slice16
        .get_subslice(offset..offset + len)
        .and_then(|slice| slice.split_first())
        .map_or_else(
            || {
                // GIGO: offset/len out of range for the data.
                debug_assert!(false);
                (REPLACEMENT_CHARACTER, EMPTY_U16)
            },
            |(first, trail)| (char_from_u16(first), trail),
        );
    if only_non_starters_in_trail {
        // The data guarantees the whole trail is non-starters, so defer the
        // ccc lookup by using the placeholder class.
        self.buffer.extend(
            tail.iter()
                .map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))),
        );
        (starter, 0)
    } else {
        // Mixed trail: look each character up to find where the final run
        // of non-starters begins.
        let mut i = 0;
        let mut combining_start = 0;
        for u in tail.iter() {
            let ch = char_from_u16(u);
            let trie_value = self.trie.get(ch);
            self.buffer.push(CharacterAndClass::new_with_trie_value(
                CharacterAndTrieValue::new(ch, trie_value),
            ));
            i += 1;
            if !decomposition_starts_with_non_starter(trie_value) {
                combining_start = i;
            }
        }
        (starter, combining_start)
    }
}
/// Analogue of `push_decomposition16` for decompositions stored in a
/// `char`-valued expansion table: pushes the trailing characters into
/// `self.buffer` and returns the leading character plus the index where the
/// trailing run of non-starters begins.
fn push_decomposition32(
    &mut self,
    offset: usize,
    len: usize,
    only_non_starters_in_trail: bool,
    slice32: &ZeroSlice<char>,
) -> (char, usize) {
    let (starter, tail) = slice32
        .get_subslice(offset..offset + len)
        .and_then(|slice| slice.split_first())
        .unwrap_or_else(|| {
            // GIGO: offset/len out of range for the data.
            debug_assert!(false);
            (REPLACEMENT_CHARACTER, EMPTY_CHAR)
        });
    if only_non_starters_in_trail {
        // Whole trail is non-starters; ccc is resolved later from the
        // placeholder.
        self.buffer
            .extend(tail.iter().map(CharacterAndClass::new_with_placeholder));
        (starter, 0)
    } else {
        // Mixed trail: find where the final run of non-starters begins.
        let mut i = 0;
        let mut combining_start = 0;
        for ch in tail.iter() {
            let trie_value = self.trie.get(ch);
            self.buffer.push(CharacterAndClass::new_with_trie_value(
                CharacterAndTrieValue::new(ch, trie_value),
            ));
            i += 1;
            if !decomposition_starts_with_non_starter(trie_value) {
                combining_start = i;
            }
        }
        (starter, combining_start)
    }
}
/// Looks up the trie value for `c` and pairs the two.
#[inline(always)]
fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
    CharacterAndTrieValue::new(c, self.trie.get(c))
}
/// Pulls the next character directly from the delegate (requires that no
/// `pending` character exists), attaching its trie value and applying the
/// configured treatment of default-ignorable characters.
fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
    debug_assert!(self.pending.is_none());
    loop {
        let c = self.delegate.next()?;
        // Fast path: below the passthrough bound no trie lookup is needed;
        // trie value 0 means "starter that decomposes to self".
        if u32::from(c) < self.decomposition_passthrough_bound {
            return Some(CharacterAndTrieValue::new(c, 0));
        }
        let trie_val = self.trie.get(c);
        if trie_val == IGNORABLE_MARKER {
            match self.ignorable_behavior {
                IgnorableBehavior::Unsupported => {
                    // Non-UTS 46 data should never contain this marker;
                    // fall through and pass the value along (GIGO).
                    debug_assert!(false);
                }
                IgnorableBehavior::ReplacementCharacter => {
                    // Keep the original character but make it decompose to
                    // U+FFFD, flagged as non-round-tripping.
                    return Some(CharacterAndTrieValue::new(
                        c,
                        u32::from(REPLACEMENT_CHARACTER) | NON_ROUND_TRIP_MARKER,
                    ));
                }
                IgnorableBehavior::Ignored => {
                    // Drop the character and fetch the next one.
                    continue;
                }
            }
        }
        return Some(CharacterAndTrieValue::new(c, trie_val));
    }
}
/// Returns the stashed `pending` character if there is one; otherwise pulls
/// the next character from the delegate.
fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
    match self.pending.take() {
        Some(pending) => Some(pending),
        None => self.delegate_next_no_pending(),
    }
}
/// Decomposes `c_and_trie_val`: returns the first (starter) character of
/// the decomposition, pushes the rest into `self.buffer`, then gathers any
/// following non-starters from the stream and sorts the combining run.
fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
    let (starter, combining_start) = {
        let c = c_and_trie_val.character;
        let decomposition = c_and_trie_val.trie_val;
        // The two topmost bits are markers; if everything else is zero,
        // the character is a starter that decomposes to itself.
        if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
            (c, 0)
        } else {
            let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
            let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
            if !high_zeros && !low_zeros {
                // Both halves in use: a two-character decomposition packed
                // directly into the trie value — starter in the low 15
                // bits, a combining character in the next 15.
                let starter = char_from_u32(decomposition & 0x7FFF);
                let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
                self.buffer
                    .push(CharacterAndClass::new_with_placeholder(combining));
                (starter, 0)
            } else if high_zeros {
                // Low half only: either a Hangul syllable (value 1) or a
                // BMP singleton decomposition.
                let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE);
                if hangul_offset < HANGUL_S_COUNT {
                    debug_assert_eq!(decomposition, 1);
                    // Algorithmic Hangul decomposition into L, V and
                    // (when t != 0) T jamo.
                    let l = hangul_offset / HANGUL_N_COUNT;
                    let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
                    let t = hangul_offset % HANGUL_T_COUNT;
                    // SAFETY: the computed V/T/L jamo values lie within the
                    // conjoining jamo block (valid scalar values).
                    self.buffer.push(CharacterAndClass::new_starter(unsafe {
                        core::char::from_u32_unchecked(HANGUL_V_BASE + v)
                    }));
                    let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
                    if t != 0 {
                        self.buffer.push(CharacterAndClass::new_starter(unsafe {
                            core::char::from_u32_unchecked(HANGUL_T_BASE + t)
                        }));
                        (first, 2)
                    } else {
                        (first, 1)
                    }
                } else {
                    let singleton = decomposition as u16;
                    if singleton != FDFA_MARKER {
                        // Decomposition into a single BMP character.
                        let starter = char_from_u16(singleton);
                        (starter, 0)
                    } else {
                        // Special-cased 18-character NFKD expansion of
                        // U+FDFA: U+0635 followed by FDFA_NFKD.
                        self.buffer.extend(FDFA_NFKD.map(|u| {
                            CharacterAndClass::new_starter(unsafe {
                                // SAFETY: FDFA_NFKD contains only valid
                                // BMP scalar values.
                                core::char::from_u32_unchecked(u32::from(u))
                            })
                        }));
                        ('\u{0635}', 17)
                    }
                }
            } else {
                debug_assert!(low_zeros);
                // High half only: offset and length into the expansion
                // tables — the 16-bit table first, then the char table,
                // then the supplementary pair.
                let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
                let len_bits = decomposition & 0b1111;
                let only_non_starters_in_trail = (decomposition & 0b10000) != 0;
                if offset < self.scalars16.len() {
                    self.push_decomposition16(
                        offset,
                        // 16-bit-table lengths are stored biased by 2.
                        (len_bits + 2) as usize,
                        only_non_starters_in_trail,
                        self.scalars16,
                    )
                } else if offset < self.scalars16.len() + self.scalars24.len() {
                    self.push_decomposition32(
                        offset - self.scalars16.len(),
                        // char-table lengths are stored biased by 1.
                        (len_bits + 1) as usize,
                        only_non_starters_in_trail,
                        self.scalars24,
                    )
                } else if offset
                    < self.scalars16.len()
                        + self.scalars24.len()
                        + self.supplementary_scalars16.len()
                {
                    self.push_decomposition16(
                        offset - (self.scalars16.len() + self.scalars24.len()),
                        (len_bits + 2) as usize,
                        only_non_starters_in_trail,
                        self.supplementary_scalars16,
                    )
                } else {
                    self.push_decomposition32(
                        offset
                            - (self.scalars16.len()
                                + self.scalars24.len()
                                + self.supplementary_scalars16.len()),
                        (len_bits + 1) as usize,
                        only_non_starters_in_trail,
                        self.supplementary_scalars24,
                    )
                }
            }
        }
    };
    // Pull in any subsequent non-starters from the stream and sort the
    // combining portion of the buffer by ccc.
    self.gather_and_sort_combining(combining_start);
    starter
}
/// Reads characters from the stream for as long as they are non-starters,
/// pushing them (applying the hard-coded special decompositions) into
/// `self.buffer`, then sorts `buffer[combining_start..]` by canonical
/// combining class. The first starter encountered is stashed in
/// `self.pending`.
fn gather_and_sort_combining(&mut self, combining_start: usize) {
    while let Some(ch_and_trie_val) = self.delegate_next() {
        if !trie_value_has_ccc(ch_and_trie_val.trie_val) {
            // A starter: stop gathering and leave it pending.
            self.pending = Some(ch_and_trie_val);
            break;
        } else if !trie_value_indicates_special_non_starter_decomposition(
            ch_and_trie_val.trie_val,
        ) {
            // Ordinary non-starter that decomposes to itself.
            self.buffer
                .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
        } else {
            // Non-starters whose decompositions are hard-coded here
            // instead of being carried in the data tables.
            let mapped = match ch_and_trie_val.character {
                '\u{0340}' => {
                    // COMBINING GRAVE TONE MARK
                    CharacterAndClass::new('\u{0300}', CCC_ABOVE)
                }
                '\u{0341}' => {
                    // COMBINING ACUTE TONE MARK
                    CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                }
                '\u{0343}' => {
                    // COMBINING GREEK KORONIS
                    CharacterAndClass::new('\u{0313}', CCC_ABOVE)
                }
                '\u{0344}' => {
                    // COMBINING GREEK DIALYTIKA TONOS: two-character
                    // decomposition, push the first half eagerly.
                    self.buffer
                        .push(CharacterAndClass::new('\u{0308}', CCC_ABOVE));
                    CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                }
                '\u{0F73}' => {
                    // TIBETAN VOWEL SIGN II
                    self.buffer
                        .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                    CharacterAndClass::new('\u{0F72}', ccc!(CCC130, 130))
                }
                '\u{0F75}' => {
                    // TIBETAN VOWEL SIGN UU
                    self.buffer
                        .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                    CharacterAndClass::new('\u{0F74}', ccc!(CCC132, 132))
                }
                '\u{0F81}' => {
                    // TIBETAN VOWEL SIGN REVERSED II
                    self.buffer
                        .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                    CharacterAndClass::new('\u{0F80}', ccc!(CCC130, 130))
                }
                '\u{FF9E}' => {
                    // HALFWIDTH KATAKANA VOICED SOUND MARK
                    CharacterAndClass::new('\u{3099}', ccc!(KanaVoicing, 8))
                }
                '\u{FF9F}' => {
                    // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
                    CharacterAndClass::new('\u{309A}', ccc!(KanaVoicing, 8))
                }
                _ => {
                    // GIGO: data flagged a special decomposition for a
                    // character not handled above.
                    debug_assert!(false);
                    CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
                }
            };
            self.buffer.push(mapped);
        }
    }
    // In bounds: `combining_start` is a length the buffer previously had.
    #[expect(clippy::indexing_slicing)]
    sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
}
}
impl<I> Iterator for Decomposition<'_, I>
where
    I: Iterator<Item = char>,
{
    type Item = char;
    /// Yields buffered characters first; once the buffer is exhausted,
    /// decomposes the pending character (which refills the buffer as a
    /// side effect).
    fn next(&mut self) -> Option<char> {
        match self.buffer.get(self.buffer_pos).map(|c| c.character()) {
            Some(ret) => {
                self.buffer_pos += 1;
                // Reset the buffer once fully drained so the position
                // stays zero-based and capacity is reused.
                if self.buffer_pos == self.buffer.len() {
                    self.buffer.clear();
                    self.buffer_pos = 0;
                }
                Some(ret)
            }
            None => {
                debug_assert_eq!(self.buffer_pos, 0);
                let c_and_trie_val = self.pending.take()?;
                Some(self.decomposing_next(c_and_trie_val))
            }
        }
    }
}
/// An iterator adaptor performing canonical composition on the output of a
/// `Decomposition` iterator.
#[derive(Debug)]
pub struct Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    decomposition: Decomposition<'data, I>,
    // Trie queried second-character-first for canonical compositions.
    canonical_compositions: Char16Trie<'data>,
    // A starter read out of the decomposition buffer but not yet run
    // through composition.
    unprocessed_starter: Option<char>,
    // Characters below this bound are treated as potential passthroughs.
    composition_passthrough_bound: u32,
}
impl<'data, I> Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    fn new(
        decomposition: Decomposition<'data, I>,
        canonical_compositions: Char16Trie<'data>,
        composition_passthrough_bound: u16,
    ) -> Self {
        Self {
            decomposition,
            canonical_compositions,
            unprocessed_starter: None,
            composition_passthrough_bound: u32::from(composition_passthrough_bound),
        }
    }
    /// Attempts to compose a pair of characters: Hangul is handled
    /// algorithmically, everything else via the canonical compositions trie.
    #[inline(always)]
    pub fn compose(&self, starter: char, second: char) -> Option<char> {
        compose(self.canonical_compositions.iter(), starter, second)
    }
    /// Attempts to compose a pair known not to be a Hangul L+V or LV+T
    /// case, skipping the Hangul checks.
    #[inline(always)]
    fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
        compose_non_hangul(self.canonical_compositions.iter(), starter, second)
    }
}
impl<I> Iterator for Composition<'_, I>
where
    I: Iterator<Item = char>,
{
    type Item = char;
    #[inline]
    fn next(&mut self) -> Option<char> {
        // Dummy value; overwritten before any meaningful use below.
        let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0);
        if self.unprocessed_starter.is_none() {
            // Exited on the first pass via `break`/`return`; the `loop`
            // form exists for control-flow convenience.
            #[expect(clippy::never_loop)]
            loop {
                // Drain buffered combining characters first.
                if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    self.decomposition.buffer_pos += 1;
                    if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
                        self.decomposition.buffer.clear();
                        self.decomposition.buffer_pos = 0;
                    }
                    if ccc == CCC_NOT_REORDERED {
                        // A starter in the buffer: route it through the
                        // composition machinery below.
                        self.unprocessed_starter = Some(character);
                        break;
                    }
                    return Some(character);
                }
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                undecomposed_starter = self.decomposition.pending.take()?;
                if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
                    || undecomposed_starter.potential_passthrough()
                {
                    // Fast path: the starter can be emitted as-is provided
                    // the upcoming character cannot combine backwards.
                    if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
                        let cannot_combine_backwards = u32::from(upcoming.character)
                            < self.composition_passthrough_bound
                            || !upcoming.can_combine_backwards();
                        self.decomposition.pending = Some(upcoming);
                        if cannot_combine_backwards {
                            return Some(undecomposed_starter.character);
                        }
                    } else {
                        // End of the stream right after the starter.
                        return Some(undecomposed_starter.character);
                    }
                }
                break;
            }
        }
        let mut starter = '\u{0}'; // dummy; assigned before first real use
        let mut attempt_composition = false;
        loop {
            if let Some(unprocessed) = self.unprocessed_starter.take() {
                debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0));
                debug_assert_eq!(starter, '\u{0}');
                starter = unprocessed;
            } else {
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
                if !attempt_composition {
                    starter = next_starter;
                } else if let Some(composed) = self.compose(starter, next_starter) {
                    starter = composed;
                } else {
                    // The new starter does not compose with the previous
                    // one: emit the previous one and keep the new one.
                    self.unprocessed_starter = Some(next_starter);
                    return Some(starter);
                }
            }
            // Compose the buffered combining characters onto `starter`.
            loop {
                let (character, ccc) = if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    (character, ccc)
                } else {
                    self.decomposition.buffer.clear();
                    self.decomposition.buffer_pos = 0;
                    break;
                };
                if let Some(composed) = self.compose(starter, character) {
                    starter = composed;
                    self.decomposition.buffer_pos += 1;
                    continue;
                }
                // `character` failed to compose; drop the already-consumed
                // prefix of the buffer.
                let mut most_recent_skipped_ccc = ccc;
                {
                    let _ = self
                        .decomposition
                        .buffer
                        .drain(0..self.decomposition.buffer_pos);
                }
                self.decomposition.buffer_pos = 0;
                if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                    // A non-composing starter blocks everything after it.
                    return Some(starter);
                }
                // Try discontiguous composition: a later character is only
                // blocked if a skipped character has the same ccc.
                let mut i = 1; // index 0 is the character that failed above
                while let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(i)
                    .map(|c| c.character_and_ccc())
                {
                    if ccc == CCC_NOT_REORDERED {
                        return Some(starter);
                    }
                    debug_assert!(ccc >= most_recent_skipped_ccc);
                    if ccc != most_recent_skipped_ccc {
                        // Not blocked by a preceding skipped character.
                        if let Some(composed) = self.compose_non_hangul(starter, character) {
                            self.decomposition.buffer.remove(i);
                            starter = composed;
                            continue;
                        }
                    }
                    most_recent_skipped_ccc = ccc;
                    i += 1;
                }
                break;
            }
            debug_assert_eq!(self.decomposition.buffer_pos, 0);
            if !self.decomposition.buffer.is_empty() {
                // Unconsumed combining characters remain; emit the starter
                // and drain them on subsequent calls.
                return Some(starter);
            }
            if let Some(pending) = self.decomposition.pending.take() {
                if u32::from(pending.character) < self.composition_passthrough_bound
                    || !pending.can_combine_backwards()
                {
                    // The next character cannot affect `starter`.
                    self.decomposition.pending = Some(pending);
                    return Some(starter);
                }
                // The next character may combine backwards onto `starter`:
                // loop around and attempt the composition.
                undecomposed_starter = pending;
                attempt_composition = true;
                continue;
            }
            // End of the stream.
            return Some(starter);
        }
    }
}
// Expands to a composing normalization method for a given sink and input
// slice type. `$prolog` runs first; `$fast` is the passthrough fast path
// operating on `$pending_slice`; `$always_valid_utf` is `true` when the
// input type guarantees valid Unicode (so a U+FFFD in the stream really
// came from the input rather than from error replacement).
macro_rules! composing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $always_valid_utf:literal,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $composition:ident,
     $composition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $len_utf:ident,
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog
            let mut $composition = self.normalize_iter($text.chars());
            debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
            // Flush whatever iterator construction already decomposed.
            for cc in $composition.decomposition.buffer.drain(..) {
                $sink.write_char(cc.character())?;
            }
            let $composition_passthrough_bound = $composition.composition_passthrough_bound;
            'outer: loop {
                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                let mut $undecomposed_starter =
                    if let Some(pending) = $composition.decomposition.pending.take() {
                        pending
                    } else {
                        // Input exhausted.
                        return Ok(());
                    };
                if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
                    $undecomposed_starter.potential_passthrough()
                {
                    // Fast path copying passthrough input straight to the
                    // sink; guarded against U+FFFD when the input type may
                    // contain encoding errors.
                    if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
                        // Slice of the input starting at the undecomposed
                        // starter (the delegate has consumed past it).
                        let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
                        $fast
                    }
                }
                // Slow path: decompose, then recompose onto `starter`.
                let mut starter = $composition
                    .decomposition
                    .decomposing_next($undecomposed_starter);
                'bufferloop: loop {
                    // Compose buffered combining characters onto `starter`.
                    loop {
                        let (character, ccc) = if let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get($composition.decomposition.buffer_pos)
                            .map(|c| c.character_and_ccc())
                        {
                            (character, ccc)
                        } else {
                            $composition.decomposition.buffer.clear();
                            $composition.decomposition.buffer_pos = 0;
                            break;
                        };
                        if let Some(composed) = $composition.compose(starter, character) {
                            starter = composed;
                            $composition.decomposition.buffer_pos += 1;
                            continue;
                        }
                        let mut most_recent_skipped_ccc = ccc;
                        if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                            // A starter that doesn't compose: write out the
                            // previous one and restart with this one.
                            $sink.write_char(starter)?;
                            starter = character;
                            $composition.decomposition.buffer_pos += 1;
                            continue 'bufferloop;
                        } else {
                            // Discard the already-composed buffer prefix.
                            {
                                let _ = $composition
                                    .decomposition
                                    .buffer
                                    .drain(0..$composition.decomposition.buffer_pos);
                            }
                            $composition.decomposition.buffer_pos = 0;
                        }
                        // Try discontiguous composition: a later character
                        // is blocked only by a skipped one of the same ccc.
                        let mut i = 1; // index 0 is the blocked character
                        while let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get(i)
                            .map(|c| c.character_and_ccc())
                        {
                            if ccc == CCC_NOT_REORDERED {
                                // Flush up to this starter and restart.
                                $sink.write_char(starter)?;
                                for cc in $composition.decomposition.buffer.drain(..i) {
                                    $sink.write_char(cc.character())?;
                                }
                                starter = character;
                                {
                                    let removed = $composition.decomposition.buffer.remove(0);
                                    debug_assert_eq!(starter, removed.character());
                                }
                                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                                continue 'bufferloop;
                            }
                            debug_assert!(ccc >= most_recent_skipped_ccc);
                            if ccc != most_recent_skipped_ccc {
                                // Not blocked; try composing.
                                if let Some(composed) =
                                    $composition.compose_non_hangul(starter, character)
                                {
                                    $composition.decomposition.buffer.remove(i);
                                    starter = composed;
                                    continue;
                                }
                            }
                            most_recent_skipped_ccc = ccc;
                            i += 1;
                        }
                        break;
                    }
                    debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                    if !$composition.decomposition.buffer.is_empty() {
                        // Leftover combining characters that neither
                        // composed nor were flushed above.
                        $sink.write_char(starter)?;
                        for cc in $composition.decomposition.buffer.drain(..) {
                            $sink.write_char(cc.character())?;
                        }
                        continue 'outer;
                    }
                    if $composition.decomposition.pending.is_some() {
                        // Checked `is_some` above, so `unwrap` cannot fail.
                        let pending = $composition.decomposition.pending.as_ref().unwrap();
                        if u32::from(pending.character) < $composition.composition_passthrough_bound
                            || !pending.can_combine_backwards()
                        {
                            // The next character cannot affect `starter`.
                            $sink.write_char(starter)?;
                            continue 'outer;
                        }
                        let pending_starter = $composition.decomposition.pending.take().unwrap();
                        let decomposed = $composition.decomposition.decomposing_next(pending_starter);
                        if let Some(composed) = $composition.compose(starter, decomposed) {
                            starter = composed;
                        } else {
                            $sink.write_char(starter)?;
                            starter = decomposed;
                        }
                        continue 'bufferloop;
                    }
                    // End of input.
                    $sink.write_char(starter)?;
                    return Ok(());
                }
            }
        }
    };
}
// Expands to a decomposing normalization method for a given sink and input
// slice type; `$fast` is the passthrough fast path operating on
// `$pending_slice`, and `$outer` names the main loop label.
macro_rules! decomposing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $decomposition:ident,
     $decomposition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $outer:lifetime, // loop labels lex as lifetime tokens
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog
            let mut $decomposition = self.normalize_iter($text.chars());
            debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
            let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
            $outer: loop {
                // Drain the buffered (already decomposed) characters.
                for cc in $decomposition.buffer.drain(..) {
                    $sink.write_char(cc.character())?;
                }
                debug_assert_eq!($decomposition.buffer_pos, 0);
                let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
                    pending
                } else {
                    // Input exhausted.
                    return Ok(());
                };
                if $undecomposed_starter.starter_and_decomposes_to_self() {
                    // Fast path: write the starter and let `$fast` copy as
                    // long a passthrough run as it can.
                    $sink.write_char($undecomposed_starter.character)?;
                    let $pending_slice = $decomposition.delegate.$as_slice();
                    $fast
                }
                let starter = $decomposition.decomposing_next($undecomposed_starter);
                $sink.write_char(starter)?;
            }
        }
    };
}
// Expands to the public convenience API (`normalize`, `split_normalized`,
// `is_normalized`, plus UTF-8/UTF-16 counterparts) shared by the normalizer
// types; each method delegates to the matching `normalize_*to` worker
// defined on the same type.
macro_rules! normalizer_methods {
    () => {
        /// Normalizes a string slice, returning a borrow of the input when
        /// it is already normalized.
        pub fn normalize<'a>(&self, text: &'a str) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = String::new();
            ret.reserve(text.len());
            ret.push_str(head);
            let _ = self.normalize_to(tail, &mut ret);
            Cow::Owned(ret)
        }
        /// Splits `text` into an already-normalized prefix and the
        /// not-yet-normalized remainder.
        pub fn split_normalized<'a>(&self, text: &'a str) -> (&'a str, &'a str) {
            let up_to = self.is_normalized_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // GIGO: `is_normalized_up_to` must return a boundary
                // within `text`.
                debug_assert!(false);
                ("", text)
            })
        }
        /// Length in bytes of the normalized prefix of `text`.
        fn is_normalized_up_to(&self, text: &str) -> usize {
            // The sink compares the would-be output with the input and
            // records how far they agree.
            let mut sink = IsNormalizedSinkStr::new(text);
            let _ = self.normalize_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }
        /// Whether `text` is entirely normalized.
        pub fn is_normalized(&self, text: &str) -> bool {
            self.is_normalized_up_to(text) == text.len()
        }
        /// Normalizes a slice of potentially ill-formed UTF-16 (decoded by
        /// the `utf16_iter` delegate), borrowing when already normalized.
        #[cfg(feature = "utf16_iter")]
        pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
            let (head, tail) = self.split_normalized_utf16(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = alloc::vec::Vec::with_capacity(text.len());
            ret.extend_from_slice(head);
            let _ = self.normalize_utf16_to(tail, &mut ret);
            Cow::Owned(ret)
        }
        /// UTF-16 counterpart of `split_normalized`.
        #[cfg(feature = "utf16_iter")]
        pub fn split_normalized_utf16<'a>(&self, text: &'a [u16]) -> (&'a [u16], &'a [u16]) {
            let up_to = self.is_normalized_utf16_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // GIGO: the prefix length must be within bounds.
                debug_assert!(false);
                (&[], text)
            })
        }
        /// Length in code units of the normalized UTF-16 prefix.
        #[cfg(feature = "utf16_iter")]
        fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
            let mut sink = IsNormalizedSinkUtf16::new(text);
            let _ = self.normalize_utf16_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }
        /// Whether the UTF-16 slice is entirely normalized.
        #[cfg(feature = "utf16_iter")]
        pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
            self.is_normalized_utf16_up_to(text) == text.len()
        }
        /// Normalizes a slice of potentially ill-formed UTF-8 (decoded by
        /// the `utf8_iter` delegate), borrowing when already normalized.
        #[cfg(feature = "utf8_iter")]
        pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized_utf8(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = String::new();
            ret.reserve(text.len());
            ret.push_str(head);
            let _ = self.normalize_utf8_to(tail, &mut ret);
            Cow::Owned(ret)
        }
        /// UTF-8 counterpart of `split_normalized`; the normalized prefix
        /// is returned as `&str`.
        #[cfg(feature = "utf8_iter")]
        pub fn split_normalized_utf8<'a>(&self, text: &'a [u8]) -> (&'a str, &'a [u8]) {
            let up_to = self.is_normalized_utf8_up_to(text);
            let (head, tail) = text.split_at_checked(up_to).unwrap_or_else(|| {
                // GIGO: the prefix length must be within bounds.
                debug_assert!(false);
                (&[], text)
            });
            // SAFETY: relies on `is_normalized_utf8_up_to` counting only a
            // valid-UTF-8 prefix — NOTE(review): confirm against the sink
            // implementation, which is outside this view.
            (unsafe { core::str::from_utf8_unchecked(head) }, tail)
        }
        /// Length in bytes of the normalized UTF-8 prefix.
        #[cfg(feature = "utf8_iter")]
        fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
            let mut sink = IsNormalizedSinkUtf8::new(text);
            let _ = self.normalize_utf8_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }
        /// Whether the UTF-8 slice is entirely normalized.
        #[cfg(feature = "utf8_iter")]
        pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
            self.is_normalized_utf8_up_to(text) == text.len()
        }
    };
}
/// Borrowed variant of a decomposing normalizer, holding references to the
/// decomposition data and expansion tables.
#[derive(Debug)]
pub struct DecomposingNormalizerBorrowed<'a> {
    decompositions: &'a DecompositionData<'a>,
    tables: &'a DecompositionTables<'a>,
    // Present for compatibility forms (e.g. NFKD); `None` for plain NFD.
    supplementary_tables: Option<&'a DecompositionTables<'a>>,
    // Passthrough bounds used by `Decomposition` / `Composition`.
    decomposition_passthrough_bound: u8,
    composition_passthrough_bound: u16,
}
impl DecomposingNormalizerBorrowed<'static> {
/// Cheaply converts a `DecomposingNormalizerBorrowed<'static>` into an
/// owned `DecomposingNormalizer` by wrapping the static references in
/// data payloads.
pub const fn static_to_owned(self) -> DecomposingNormalizer {
    DecomposingNormalizer {
        decompositions: DataPayload::from_static_ref(self.decompositions),
        tables: DataPayload::from_static_ref(self.tables),
        // Written out manually because `Option::map` is not const.
        supplementary_tables: if let Some(s) = self.supplementary_tables {
            Some(DataPayload::from_static_ref(s))
        } else {
            None
        },
        decomposition_passthrough_bound: self.decomposition_passthrough_bound,
        composition_passthrough_bound: self.composition_passthrough_bound,
    }
}
/// NFD normalizer constructed from compiled data.
#[cfg(feature = "compiled_data")]
pub const fn new_nfd() -> Self {
    // Compile-time guard that the combined expansion-table lengths fit the
    // offset encoding in the trie values (headroom reserved for "future
    // extension" per the assertion message).
    const _: () = assert!(
        provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
            .scalars16
            .const_len()
            + provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
                .scalars24
                .const_len()
            <= 0xFFF,
        "future extension"
    );
    DecomposingNormalizerBorrowed {
        decompositions: provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
        tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
        supplementary_tables: None,
        // Structural bounds for NFD: characters below U+00C0 decompose to
        // themselves; composition is keyed from U+0300 up.
        decomposition_passthrough_bound: 0xC0,
        composition_passthrough_bound: 0x0300,
    }
}
/// NFKD normalizer constructed from compiled data.
#[cfg(feature = "compiled_data")]
pub const fn new_nfkd() -> Self {
    // Compile-time guard that the combined (NFD + NFKD) expansion-table
    // lengths fit the offset encoding in the trie values.
    const _: () = assert!(
        provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
            .scalars16
            .const_len()
            + provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
                .scalars24
                .const_len()
            + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
                .scalars16
                .const_len()
            + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
                .scalars24
                .const_len()
            <= 0xFFF,
        "future extension"
    );
    // The data-supplied passthrough cap must not exceed the structural
    // maximum.
    const _: () = assert!(
        provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap <= 0x0300,
        "invalid"
    );
    // Clamp the data-driven cap to the structural bounds (0xC0 for
    // decomposition, 0x0300 for composition); written as manual `if`s
    // because `min` is not available in const context.
    let decomposition_capped =
        if provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0xC0 {
            provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
        } else {
            0xC0
        };
    let composition_capped =
        if provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0x0300 {
            provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
        } else {
            0x0300
        };
    DecomposingNormalizerBorrowed {
        decompositions: provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1,
        tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
        supplementary_tables: Some(provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
        // Truncation is lossless: capped to at most 0xC0 above.
        decomposition_passthrough_bound: decomposition_capped as u8,
        composition_passthrough_bound: composition_capped,
    }
}
#[cfg(feature = "compiled_data")]
pub(crate) const fn new_uts46_decomposed() -> Self {
const _: () = assert!(
provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
.scalars16
.const_len()
+ provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
.scalars24
.const_len()
+ provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
.scalars16
.const_len()
+ provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
.scalars24
.const_len()
<= 0xFFF,
"future extension"
);
const _: () = assert!(
provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap <= 0x0300,
"invalid"
);
let decomposition_capped =
if provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0xC0 {
provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
} else {
0xC0
};
let composition_capped =
if provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0x0300 {
provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
} else {
0x0300
};
DecomposingNormalizerBorrowed {
decompositions: provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1,
tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
supplementary_tables: Some(provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
decomposition_passthrough_bound: decomposition_capped as u8,
composition_passthrough_bound: composition_capped,
}
}
}
impl<'data> DecomposingNormalizerBorrowed<'data> {
    /// Constructs a decomposing normalizer directly from data structs,
    /// without supplementary tables (i.e. NFD-style bounds).
    ///
    /// Hidden from docs; NOTE(review): presumably for use by sibling crates
    /// (e.g. the collator) — confirm against callers.
    #[doc(hidden)]
    pub fn new_with_data(
        decompositions: &'data DecompositionData<'data>,
        tables: &'data DecompositionTables<'data>,
    ) -> Self {
        Self {
            decompositions,
            tables,
            supplementary_tables: None,
            // Same hard-coded caps as the compiled-data NFD constructor —
            // TODO confirm they are valid for arbitrary caller-supplied data.
            decomposition_passthrough_bound: 0xC0,
            composition_passthrough_bound: 0x0300,
        }
    }
    /// Wraps a delegate `char` iterator in a [`Decomposition`] adapter that
    /// yields the decomposed form of the input lazily.
    pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<'data, I> {
        Decomposition::new_with_supplements(
            iter,
            self.decompositions,
            self.tables,
            self.supplementary_tables,
            self.decomposition_passthrough_bound,
            IgnorableBehavior::Unsupported,
        )
    }
    // Generates the shared method families (e.g. `normalize`,
    // `split_normalized*`, `is_normalized*`) on top of the `normalize_*_to`
    // sink methods produced by the macro invocations below.
    normalizer_methods!();
    // `normalize_to`: write the decomposed form of a `&str` into a
    // `core::fmt::Write` sink. The block argument supplies the
    // format-specific fast path: scan bytes that are known to pass through
    // unchanged and flush maximal passthrough spans with single `write_str`
    // calls.
    decomposing_normalize_to!(
        ,
        normalize_to,
        core::fmt::Write,
        &str,
        {
        },
        as_str,
        {
            // With the canonical bound of U+00C0, any UTF-8 lead byte below
            // 0xC3 starts a passthrough character; otherwise fall back to an
            // ASCII-only byte bound.
            let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
                0xC3u8
            } else {
                decomposition_passthrough_bound.min(0x80) as u8
            };
            #[expect(clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
                // Innermost loop: skip bytes that cannot start a decomposing
                // character.
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < decomposition_passthrough_byte_bound {
                            continue 'fastest;
                        }
                        // Rewind the delegate to just before the byte that
                        // stopped the scan so the full char gets decoded.
                        decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
                        break 'fastest;
                    }
                    // Input exhausted: the entire remaining slice passed
                    // through unchanged.
                    sink.write_str(pending_slice)?;
                    return Ok(());
                }
                // unwrap is fine: the delegate was rewound to a non-empty
                // position above (covered by the `expect` on the loop).
                let upcoming = decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.starter_and_decomposes_to_self() {
                    continue 'fast;
                }
                // Flush the passthrough prefix before dropping to the slow
                // path.
                let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                    - decomposition.delegate.as_str().len()
                    - upcoming.len_utf8()];
                sink.write_str(consumed_so_far_slice)?;
                if decomposition_starts_with_non_starter(
                    upcoming_with_trie_value.trie_val,
                ) {
                    // A non-starter cannot be emitted yet: it may need to be
                    // canonically reordered with what follows.
                    decomposition.pending = Some(upcoming_with_trie_value);
                    decomposition.gather_and_sort_combining(0);
                    continue 'outer;
                }
                undecomposed_starter = upcoming_with_trie_value;
                debug_assert!(decomposition.pending.is_none());
                break 'fast;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
    // `normalize_utf8_to`: like `normalize_to`, but for potentially-invalid
    // UTF-8; ill-formed sequences surface as U+FFFD via the delegate decoder.
    decomposing_normalize_to!(
        #[cfg(feature = "utf8_iter")]
        ,
        normalize_utf8_to,
        core::fmt::Write,
        &[u8],
        {
        },
        as_slice,
        {
            // For potentially-invalid UTF-8, only ASCII below the bound can
            // be passed byte-by-byte.
            let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8;
            'fast: loop {
                let mut code_unit_iter = decomposition.delegate.as_slice().iter();
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < decomposition_passthrough_byte_bound {
                            continue 'fastest;
                        }
                        break 'fastest;
                    }
                    // NOTE(review): `from_utf8_unchecked` relies on the
                    // macro's invariant that `pending_slice` holds bytes
                    // already checked on previous passes — confirm against
                    // the macro definition.
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
                    return Ok(());
                }
                // Rewind the delegate to just before the stopping byte.
                #[expect(clippy::indexing_slicing)]
                {decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();}
                #[expect(clippy::unwrap_used)]
                let upcoming = decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.starter_and_decomposes_to_self_except_replacement() {
                    continue 'fast;
                }
                if upcoming == REPLACEMENT_CHARACTER {
                    // U+FFFD may be a decoding artifact of invalid input, so
                    // its in-stream byte length cannot be computed with
                    // `len_utf8`; step back over it with the char iterator
                    // instead.
                    #[expect(clippy::indexing_slicing)]
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
                    let back = consumed_so_far.next_back();
                    debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
                    let consumed_so_far_slice = consumed_so_far.as_slice();
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;
                    undecomposed_starter = upcoming_with_trie_value;
                    debug_assert!(decomposition.pending.is_none());
                    break 'fast;
                }
                #[expect(clippy::indexing_slicing)]
                let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                    - decomposition.delegate.as_slice().len()
                    - upcoming.len_utf8()];
                sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;
                if decomposition_starts_with_non_starter(
                    upcoming_with_trie_value.trie_val,
                ) {
                    // Defer the non-starter for canonical reordering.
                    decomposition.pending = Some(upcoming_with_trie_value);
                    decomposition.gather_and_sort_combining(0);
                    continue 'outer;
                }
                undecomposed_starter = upcoming_with_trie_value;
                debug_assert!(decomposition.pending.is_none());
                break 'fast;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
    // `normalize_utf16_to`: UTF-16 fast path using raw pointers over the
    // delegate's code units; unpaired surrogates become U+FFFD.
    decomposing_normalize_to!(
        #[cfg(feature = "utf16_iter")]
        ,
        normalize_utf16_to,
        write16::Write16,
        &[u16],
        {
            sink.size_hint(text.len())?;
        },
        as_slice,
        {
            // Runs at most once; the label exists only so inner code can
            // `break 'fastwrap` to fall through to the slow path.
            #[expect(clippy::never_loop)]
            'fastwrap: loop {
                let delegate_as_slice = decomposition.delegate.as_slice();
                let mut ptr: *const u16 = delegate_as_slice.as_ptr();
                // SAFETY: one-past-the-end pointer of the same allocation.
                let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
                'fast: loop {
                    if ptr != end {
                        // SAFETY: `ptr != end` and `ptr` only advances by 1
                        // within the slice, so the read is in bounds.
                        let upcoming_code_unit = unsafe { *ptr };
                        ptr = unsafe { ptr.add(1) };
                        let mut upcoming32 = u32::from(upcoming_code_unit);
                        if upcoming32 < decomposition_passthrough_bound {
                            continue 'fast;
                        }
                        let mut trie_value = decomposition.trie.get16(upcoming_code_unit);
                        if starter_and_decomposes_to_self_impl(trie_value) {
                            continue 'fast;
                        }
                        // Surrogate pairing; single-pass loop used for its
                        // `break`s only.
                        #[expect(clippy::never_loop)]
                        'surrogateloop: loop {
                            let surrogate_base = upcoming32.wrapping_sub(0xD800);
                            if likely(surrogate_base > (0xDFFF - 0xD800)) {
                                // Not a surrogate at all.
                                break 'surrogateloop;
                            }
                            if likely(surrogate_base <= (0xDBFF - 0xD800)) {
                                // High surrogate: try to pair with the next
                                // code unit.
                                if ptr != end {
                                    // SAFETY: `ptr != end`, so in bounds.
                                    let low = unsafe { *ptr };
                                    if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
                                        ptr = unsafe { ptr.add(1) };
                                        upcoming32 = (upcoming32 << 10) + u32::from(low)
                                            - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                                        trie_value = {
                                            #[cfg(not(icu4x_unstable_fast_trie_only))]
                                            {decomposition.trie.get32(upcoming32)}
                                            #[cfg(icu4x_unstable_fast_trie_only)]
                                            {decomposition.trie.get32_supplementary(upcoming32)}
                                        };
                                        if likely(starter_and_decomposes_to_self_impl(trie_value)) {
                                            continue 'fast;
                                        }
                                        break 'surrogateloop;
                                    }
                                }
                            }
                            // Unpaired surrogate: replace with U+FFFD.
                            upcoming32 = 0xFFFD; break 'surrogateloop;
                        }
                        // SAFETY: `upcoming32` is either a scalar value
                        // assembled from a valid surrogate pair, a non-
                        // surrogate BMP value, or 0xFFFD.
                        let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                        let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
                        let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
                            unsafe { end.offset_from(ptr) as usize }
                            - upcoming.len_utf16()) else {
                            debug_assert!(false);
                            break 'fastwrap;
                        };
                        sink.write_slice(consumed_so_far_slice)?;
                        if decomposition_starts_with_non_starter(
                            upcoming_with_trie_value.trie_val,
                        ) {
                            // SAFETY: `ptr..end` is a valid subslice of the
                            // delegate slice.
                            decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
                            decomposition.pending = Some(upcoming_with_trie_value);
                            decomposition.gather_and_sort_combining(0);
                            continue 'outer;
                        }
                        undecomposed_starter = upcoming_with_trie_value;
                        debug_assert!(decomposition.pending.is_none());
                        break 'fast;
                    }
                    // Input exhausted: everything passed through.
                    sink.write_slice(pending_slice)?;
                    return Ok(());
                }
                // SAFETY: `ptr..end` is a valid subslice of the delegate slice.
                decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
                break 'fastwrap;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
}
/// Owned version of [`DecomposingNormalizerBorrowed`], holding its data as
/// `DataPayload`s (possibly loaded at runtime from a provider).
#[derive(Debug)]
pub struct DecomposingNormalizer {
    /// Core decomposition data (trie and passthrough cap).
    decompositions: DataPayload<NormalizerNfdDataV1>,
    /// NFD expansion tables.
    tables: DataPayload<NormalizerNfdTablesV1>,
    /// Extra expansion tables for NFKD/UTS 46; `None` for plain NFD.
    supplementary_tables: Option<DataPayload<NormalizerNfkdTablesV1>>,
    // Same passthrough caps as on the borrowed struct; see
    // `DecomposingNormalizerBorrowed` for their meaning.
    decomposition_passthrough_bound: u8, composition_passthrough_bound: u16, }
impl DecomposingNormalizer {
    /// Constructs a borrowed version of this type for more efficient querying.
    pub fn as_borrowed(&self) -> DecomposingNormalizerBorrowed<'_> {
        DecomposingNormalizerBorrowed {
            decompositions: self.decompositions.get(),
            tables: self.tables.get(),
            supplementary_tables: self.supplementary_tables.as_ref().map(|s| s.get()),
            decomposition_passthrough_bound: self.decomposition_passthrough_bound,
            composition_passthrough_bound: self.composition_passthrough_bound,
        }
    }
    /// NFD constructor using compiled data.
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfd() -> DecomposingNormalizerBorrowed<'static> {
        DecomposingNormalizerBorrowed::new_nfd()
    }
    // Generates `try_new_nfd_with_buffer_provider`, delegating to
    // `try_new_nfd_unstable` below.
    icu_provider::gen_buffer_data_constructors!(
        () -> error: DataError,
        functions: [
            new_nfd: skip,
            try_new_nfd_with_buffer_provider,
            try_new_nfd_unstable,
            Self,
        ]
    );
    /// NFD constructor with a custom data provider.
    ///
    /// # Errors
    /// Returns a `DataError` if loading fails or the loaded data violates
    /// the invariants checked below.
    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
    pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerNfdDataV1> + DataProvider<NormalizerNfdTablesV1> + ?Sized,
    {
        let decompositions: DataPayload<NormalizerNfdDataV1> =
            provider.load(Default::default())?.payload;
        let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
        // Runtime analog of the compile-time size guard in the
        // compiled-data constructors.
        if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
            return Err(
                DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
            );
        }
        let cap = decompositions.get().passthrough_cap;
        if cap > 0x0300 {
            return Err(DataError::custom("invalid").with_marker(NormalizerNfdDataV1::INFO));
        }
        // Unlike in the `const fn` constructors, `min` is usable here.
        let decomposition_capped = cap.min(0xC0);
        let composition_capped = cap.min(0x0300);
        Ok(DecomposingNormalizer {
            decompositions,
            tables,
            supplementary_tables: None,
            decomposition_passthrough_bound: decomposition_capped as u8,
            composition_passthrough_bound: composition_capped,
        })
    }
    // Generates `try_new_nfkd_with_buffer_provider`, delegating to
    // `try_new_nfkd_unstable` below.
    icu_provider::gen_buffer_data_constructors!(
        () -> error: DataError,
        functions: [
            new_nfkd: skip,
            try_new_nfkd_with_buffer_provider,
            try_new_nfkd_unstable,
            Self,
        ]
    );
    /// NFKD constructor using compiled data.
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfkd() -> DecomposingNormalizerBorrowed<'static> {
        DecomposingNormalizerBorrowed::new_nfkd()
    }
    /// NFKD constructor with a custom data provider.
    ///
    /// # Errors
    /// Returns a `DataError` if loading fails or the loaded data violates
    /// the invariants checked below.
    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
    pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerNfkdDataV1>
            + DataProvider<NormalizerNfdTablesV1>
            + DataProvider<NormalizerNfkdTablesV1>
            + ?Sized,
    {
        let decompositions: DataPayload<NormalizerNfkdDataV1> =
            provider.load(Default::default())?.payload;
        let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
        let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
            provider.load(Default::default())?.payload;
        // NFKD indexes into both table sets, so the combined size is bounded.
        if tables.get().scalars16.len()
            + tables.get().scalars24.len()
            + supplementary_tables.get().scalars16.len()
            + supplementary_tables.get().scalars24.len()
            > 0xFFF
        {
            return Err(
                DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
            );
        }
        let cap = decompositions.get().passthrough_cap;
        if cap > 0x0300 {
            return Err(DataError::custom("invalid").with_marker(NormalizerNfkdDataV1::INFO));
        }
        let decomposition_capped = cap.min(0xC0);
        let composition_capped = cap.min(0x0300);
        Ok(DecomposingNormalizer {
            // NFKD data is stored under the NFD data marker type via `cast`.
            decompositions: decompositions.cast(),
            tables,
            supplementary_tables: Some(supplementary_tables),
            decomposition_passthrough_bound: decomposition_capped as u8,
            composition_passthrough_bound: composition_capped,
        })
    }
    /// UTS 46 decomposed constructor (crate-internal) with a custom data
    /// provider. Mirrors `try_new_nfkd_unstable` but loads UTS 46 mapping
    /// data.
    ///
    /// # Errors
    /// Returns a `DataError` if loading fails or the loaded data violates
    /// the invariants checked below.
    pub(crate) fn try_new_uts46_decomposed_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerUts46DataV1>
            + DataProvider<NormalizerNfdTablesV1>
            + DataProvider<NormalizerNfkdTablesV1>
            + ?Sized,
    {
        let decompositions: DataPayload<NormalizerUts46DataV1> =
            provider.load(Default::default())?.payload;
        let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
        let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
            provider.load(Default::default())?.payload;
        if tables.get().scalars16.len()
            + tables.get().scalars24.len()
            + supplementary_tables.get().scalars16.len()
            + supplementary_tables.get().scalars24.len()
            > 0xFFF
        {
            return Err(
                DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
            );
        }
        let cap = decompositions.get().passthrough_cap;
        if cap > 0x0300 {
            return Err(DataError::custom("invalid").with_marker(NormalizerUts46DataV1::INFO));
        }
        let decomposition_capped = cap.min(0xC0);
        let composition_capped = cap.min(0x0300);
        Ok(DecomposingNormalizer {
            // UTS 46 data is stored under the NFD data marker type via `cast`.
            decompositions: decompositions.cast(),
            tables,
            supplementary_tables: Some(supplementary_tables),
            decomposition_passthrough_bound: decomposition_capped as u8,
            composition_passthrough_bound: composition_capped,
        })
    }
}
/// Borrowed version of a normalizer for performing composing normalization
/// (NFC, NFKC, or the UTS 46 variant, depending on the underlying data).
#[derive(Debug)]
pub struct ComposingNormalizerBorrowed<'a> {
    /// Decomposition stage that feeds the composition pass.
    decomposing_normalizer: DecomposingNormalizerBorrowed<'a>,
    /// Canonical composition data (pair trie).
    canonical_compositions: &'a CanonicalCompositions<'a>,
}
impl ComposingNormalizerBorrowed<'static> {
    /// Cheaply converts a [`ComposingNormalizerBorrowed<'static>`] into a
    /// [`ComposingNormalizer`] by wrapping the static references in
    /// `DataPayload`s.
    pub const fn static_to_owned(self) -> ComposingNormalizer {
        ComposingNormalizer {
            decomposing_normalizer: self.decomposing_normalizer.static_to_owned(),
            canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions),
        }
    }
    /// NFC constructor using compiled data.
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfc() -> Self {
        ComposingNormalizerBorrowed {
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfd(),
            canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }
    /// NFKC constructor using compiled data. The composition data is shared
    /// with NFC; only the decomposition stage differs.
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfkc() -> Self {
        ComposingNormalizerBorrowed {
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfkd(),
            canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }
    /// UTS 46 constructor (crate-internal) using compiled data; again, only
    /// the decomposition stage differs from NFC.
    #[cfg(feature = "compiled_data")]
    pub(crate) const fn new_uts46() -> Self {
        ComposingNormalizerBorrowed {
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_uts46_decomposed(),
            canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }
}
impl<'data> ComposingNormalizerBorrowed<'data> {
    /// Wraps a delegate `char` iterator in a [`Composition`] adapter that
    /// yields the composed form of the input lazily.
    pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<'data, I> {
        self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
    }
    // Shared implementation that also lets UTS 46 select special handling of
    // ignorable characters.
    fn normalize_iter_private<I: Iterator<Item = char>>(
        &self,
        iter: I,
        ignorable_behavior: IgnorableBehavior,
    ) -> Composition<'data, I> {
        Composition::new(
            Decomposition::new_with_supplements(
                iter,
                self.decomposing_normalizer.decompositions,
                self.decomposing_normalizer.tables,
                self.decomposing_normalizer.supplementary_tables,
                self.decomposing_normalizer.decomposition_passthrough_bound,
                ignorable_behavior,
            ),
            self.canonical_compositions.canonical_compositions.clone(),
            self.decomposing_normalizer.composition_passthrough_bound,
        )
    }
    // Generates the shared method families (e.g. `normalize`,
    // `split_normalized*`, `is_normalized*`) on top of the `normalize_*_to`
    // sink methods produced by the macro invocations below.
    normalizer_methods!();
    // `normalize_to`: write the composed form of a `&str` into a
    // `core::fmt::Write` sink. The block argument is the format-specific fast
    // path over bytes that pass through composition unchanged.
    composing_normalize_to!(
        ,
        normalize_to,
        core::fmt::Write,
        &str,
        {},
        true,
        as_str,
        {
            // With the canonical bound of U+0300, any UTF-8 lead byte below
            // 0xCC starts a passthrough character; otherwise fall back to an
            // ASCII-only byte bound.
            let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
                0xCCu8
            } else {
                composition_passthrough_bound.min(0x80) as u8
            };
            #[expect(clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
                // Innermost loop: skip bytes that cannot start a character
                // participating in composition.
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < composition_passthrough_byte_bound {
                            continue 'fastest;
                        }
                        // Rewind the delegate to just before the byte that
                        // stopped the scan.
                        let Some(remaining_slice) = pending_slice.get(pending_slice.len() - code_unit_iter.as_slice().len() - 1..) else {
                            debug_assert!(false);
                            break 'fastest;
                        };
                        composition.decomposition.delegate = remaining_slice.chars();
                        break 'fastest;
                    }
                    // Input exhausted: everything passed through.
                    sink.write_str(pending_slice)?;
                    return Ok(());
                }
                let upcoming = composition.decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
                    continue 'fast;
                }
                // Slow path: the character may compose with its predecessor,
                // so the last already-seen character is withheld from the
                // sink and becomes the undecomposed starter.
                composition.decomposition.pending = Some(upcoming_with_trie_value);
                let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
                undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                let consumed_so_far_slice = consumed_so_far.as_str();
                sink.write_str(consumed_so_far_slice)?;
                break 'fast;
            }
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf8,
    );
    // `normalize_utf8_to`: like `normalize_to`, but for potentially-invalid
    // UTF-8; ill-formed sequences surface as U+FFFD via the delegate decoder.
    composing_normalize_to!(
        #[cfg(feature = "utf8_iter")]
        ,
        normalize_utf8_to,
        core::fmt::Write,
        &[u8],
        {},
        false,
        as_slice,
        {
            'fast: loop {
                if let Some(upcoming) = composition.decomposition.delegate.next() {
                    if u32::from(upcoming) < composition_passthrough_bound {
                        continue 'fast;
                    }
                    let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                    if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
                        continue 'fast;
                    }
                    if upcoming == REPLACEMENT_CHARACTER {
                        // U+FFFD may be a decoding artifact of invalid input,
                        // so its in-stream byte length cannot be computed
                        // with `len_utf8`; step back with the char iterator.
                        #[expect(clippy::indexing_slicing)]
                        let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
                        let back = consumed_so_far.next_back();
                        debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
                        let consumed_so_far_slice = consumed_so_far.as_slice();
                        sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) })?;
                        // U+FFFD is its own starter; trie value 0 here.
                        undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
                        composition.decomposition.pending = None;
                        break 'fast;
                    }
                    // Withhold the last already-seen character as the starter
                    // that the upcoming character may compose with.
                    composition.decomposition.pending = Some(upcoming_with_trie_value);
                    #[expect(clippy::indexing_slicing)]
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
                    #[expect(clippy::unwrap_used)]
                    {
                        undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                    }
                    let consumed_so_far_slice = consumed_so_far.as_slice();
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice)})?;
                    break 'fast;
                }
                // Input exhausted: everything passed through.
                sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
                return Ok(());
            }
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf8,
    );
    // `normalize_utf16_to`: UTF-16 fast path using raw pointers over the
    // delegate's code units; unpaired surrogates become U+FFFD.
    composing_normalize_to!(
        #[cfg(feature = "utf16_iter")]
        ,
        normalize_utf16_to,
        write16::Write16,
        &[u16],
        {
            sink.size_hint(text.len())?;
        },
        false,
        as_slice,
        {
            // Runs at most once; the label exists only so inner code can
            // `break 'fastwrap` to fall through to the slow path.
            #[expect(clippy::never_loop)]
            'fastwrap: loop {
                let delegate_as_slice = composition.decomposition.delegate.as_slice();
                let mut ptr: *const u16 = delegate_as_slice.as_ptr();
                // SAFETY: one-past-the-end pointer of the same allocation.
                let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
                'fast: loop {
                    if ptr != end {
                        // SAFETY: `ptr != end` and `ptr` only advances by 1
                        // within the slice, so the read is in bounds.
                        let upcoming_code_unit = unsafe { *ptr };
                        ptr = unsafe { ptr.add(1) };
                        let mut upcoming32 = u32::from(upcoming_code_unit);
                        if upcoming32 < composition_passthrough_bound {
                            continue 'fast;
                        }
                        let mut trie_value = composition.decomposition.trie.get16(upcoming_code_unit);
                        if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) {
                            continue 'fast;
                        }
                        // Surrogate pairing; single-pass loop used for its
                        // `break`s only.
                        #[expect(clippy::never_loop)]
                        'surrogateloop: loop {
                            let surrogate_base = upcoming32.wrapping_sub(0xD800);
                            if likely(surrogate_base > (0xDFFF - 0xD800)) {
                                // Not a surrogate at all.
                                break 'surrogateloop;
                            }
                            if likely(surrogate_base <= (0xDBFF - 0xD800)) {
                                // High surrogate: try to pair with the next
                                // code unit.
                                if ptr != end {
                                    // SAFETY: `ptr != end`, so in bounds.
                                    let low = unsafe { *ptr };
                                    if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
                                        ptr = unsafe { ptr.add(1) };
                                        upcoming32 = (upcoming32 << 10) + u32::from(low)
                                            - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                                        trie_value = {
                                            #[cfg(not(icu4x_unstable_fast_trie_only))]
                                            {composition.decomposition.trie.get32(upcoming32)}
                                            #[cfg(icu4x_unstable_fast_trie_only)]
                                            {composition.decomposition.trie.get32_supplementary(upcoming32)}
                                        };
                                        if likely(potential_passthrough_and_cannot_combine_backwards_impl(trie_value)) {
                                            continue 'fast;
                                        }
                                        break 'surrogateloop;
                                    }
                                }
                            }
                            // Unpaired surrogate: replace with U+FFFD; the
                            // debug assert documents the trie value lone
                            // surrogates carry.
                            upcoming32 = 0xFFFD; debug_assert_eq!(trie_value, NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER | 0xFFFD);
                            break 'surrogateloop;
                        }
                        // SAFETY: `upcoming32` is either a scalar value
                        // assembled from a valid surrogate pair, a non-
                        // surrogate BMP value, or 0xFFFD.
                        let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                        let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
                        composition.decomposition.pending = Some(upcoming_with_trie_value);
                        let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
                            unsafe { end.offset_from(ptr) as usize }
                            - upcoming.len_utf16()) else {
                            debug_assert!(false);
                            break 'fastwrap;
                        };
                        let mut consumed_so_far = consumed_so_far_slice.chars();
                        // The last already-consumed character becomes the
                        // starter the upcoming character may compose with.
                        let Some(c_from_back) = consumed_so_far.next_back() else {
                            debug_assert!(false);
                            break 'fastwrap;
                        };
                        undecomposed_starter = composition.decomposition.attach_trie_value(c_from_back);
                        sink.write_slice(consumed_so_far.as_slice())?;
                        break 'fast;
                    }
                    // Input exhausted: everything passed through.
                    sink.write_slice(pending_slice)?;
                    return Ok(());
                }
                // SAFETY: `ptr..end` is a valid subslice of the delegate slice.
                composition.decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
                break 'fastwrap;
            }
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf16,
    );
}
/// Owned version of [`ComposingNormalizerBorrowed`], holding its data as
/// `DataPayload`s (possibly loaded at runtime from a provider).
#[derive(Debug)]
pub struct ComposingNormalizer {
    /// Decomposition stage that feeds the composition pass.
    decomposing_normalizer: DecomposingNormalizer,
    /// Canonical composition data.
    canonical_compositions: DataPayload<NormalizerNfcV1>,
}
impl ComposingNormalizer {
    /// Constructs a borrowed version of this type for more efficient querying.
    pub fn as_borrowed(&self) -> ComposingNormalizerBorrowed<'_> {
        ComposingNormalizerBorrowed {
            decomposing_normalizer: self.decomposing_normalizer.as_borrowed(),
            canonical_compositions: self.canonical_compositions.get(),
        }
    }
    /// NFC constructor using compiled data.
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfc() -> ComposingNormalizerBorrowed<'static> {
        ComposingNormalizerBorrowed::new_nfc()
    }
    // Generates `try_new_nfc_with_buffer_provider`, delegating to
    // `try_new_nfc_unstable` below.
    icu_provider::gen_buffer_data_constructors!(
        () -> error: DataError,
        functions: [
            new_nfc: skip,
            try_new_nfc_with_buffer_provider,
            try_new_nfc_unstable,
            Self,
        ]
    );
    /// NFC constructor with a custom data provider.
    ///
    /// # Errors
    /// Returns a `DataError` if any of the loads fail or the decomposition
    /// data violates the invariants checked in `try_new_nfd_unstable`.
    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
    pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerNfdDataV1>
            + DataProvider<NormalizerNfdTablesV1>
            + DataProvider<NormalizerNfcV1>
            + ?Sized,
    {
        // Data invariants are validated by the decomposing constructor.
        let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?;
        let canonical_compositions: DataPayload<NormalizerNfcV1> =
            provider.load(Default::default())?.payload;
        Ok(ComposingNormalizer {
            decomposing_normalizer,
            canonical_compositions,
        })
    }
    /// NFKC constructor using compiled data.
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfkc() -> ComposingNormalizerBorrowed<'static> {
        ComposingNormalizerBorrowed::new_nfkc()
    }
    // Generates `try_new_nfkc_with_buffer_provider`, delegating to
    // `try_new_nfkc_unstable` below.
    icu_provider::gen_buffer_data_constructors!(
        () -> error: DataError,
        functions: [
            new_nfkc: skip,
            try_new_nfkc_with_buffer_provider,
            try_new_nfkc_unstable,
            Self,
        ]
    );
    /// NFKC constructor with a custom data provider.
    ///
    /// # Errors
    /// Returns a `DataError` if any of the loads fail or the decomposition
    /// data violates the invariants checked in `try_new_nfkd_unstable`.
    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
    pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerNfkdDataV1>
            + DataProvider<NormalizerNfdTablesV1>
            + DataProvider<NormalizerNfkdTablesV1>
            + DataProvider<NormalizerNfcV1>
            + ?Sized,
    {
        let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?;
        let canonical_compositions: DataPayload<NormalizerNfcV1> =
            provider.load(Default::default())?.payload;
        Ok(ComposingNormalizer {
            decomposing_normalizer,
            canonical_compositions,
        })
    }
    /// UTS 46 constructor (crate-internal) with a custom data provider.
    ///
    /// # Errors
    /// Returns a `DataError` if any of the loads fail or the decomposition
    /// data violates the invariants checked in
    /// `try_new_uts46_decomposed_unstable`.
    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
    pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerUts46DataV1>
            + DataProvider<NormalizerNfdTablesV1>
            + DataProvider<NormalizerNfkdTablesV1>
            + DataProvider<NormalizerNfcV1>
            + ?Sized,
    {
        let decomposing_normalizer =
            DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
        let canonical_compositions: DataPayload<NormalizerNfcV1> =
            provider.load(Default::default())?.payload;
        Ok(ComposingNormalizer {
            decomposing_normalizer,
            canonical_compositions,
        })
    }
}
/// Write16 sink that checks whether everything written into it equals an
/// expected UTF-16 slice. Succeeds as long as the writes track `expect`;
/// `remaining_len` reports how much of `expect` has not been matched yet.
#[cfg(feature = "utf16_iter")]
struct IsNormalizedSinkUtf16<'a> {
    // The suffix of the original slice that has not been matched yet.
    expect: &'a [u16],
}
#[cfg(feature = "utf16_iter")]
impl<'a> IsNormalizedSinkUtf16<'a> {
    /// Creates a sink expecting exactly `slice`.
    pub fn new(slice: &'a [u16]) -> Self {
        Self { expect: slice }
    }
    /// Number of code units of the expectation not yet matched.
    pub fn remaining_len(&self) -> usize {
        self.expect.len()
    }
}
#[cfg(feature = "utf16_iter")]
impl write16::Write16 for IsNormalizedSinkUtf16<'_> {
    fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
        // The normalizer passes through subslices of the original input, so
        // pointer identity is sufficient to detect a match; comparing
        // contents would be redundant work.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        #[expect(clippy::indexing_slicing)]
        {
            self.expect = &self.expect[s.len()..];
        }
        Ok(())
    }
    fn write_char(&mut self, c: char) -> core::fmt::Result {
        // Individually written chars do not alias the input, so compare by
        // value against the front of the expectation.
        let mut rest = self.expect.chars();
        match rest.next() {
            Some(first) if first == c => {
                self.expect = rest.as_slice();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}
/// `fmt::Write` sink that checks whether everything written into it equals an
/// expected (potentially invalid) UTF-8 slice. Succeeds as long as the writes
/// track `expect`; `remaining_len` reports how much has not been matched yet.
#[cfg(feature = "utf8_iter")]
struct IsNormalizedSinkUtf8<'a> {
    // The suffix of the original slice that has not been matched yet.
    expect: &'a [u8],
}
#[cfg(feature = "utf8_iter")]
impl<'a> IsNormalizedSinkUtf8<'a> {
    /// Creates a sink expecting exactly `slice`.
    pub fn new(slice: &'a [u8]) -> Self {
        Self { expect: slice }
    }
    /// Number of bytes of the expectation not yet matched.
    pub fn remaining_len(&self) -> usize {
        self.expect.len()
    }
}
#[cfg(feature = "utf8_iter")]
impl core::fmt::Write for IsNormalizedSinkUtf8<'_> {
    fn write_str(&mut self, s: &str) -> core::fmt::Result {
        // The normalizer passes through subslices of the original input, so
        // pointer identity is sufficient to detect a match; comparing
        // contents would be redundant work.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        #[expect(clippy::indexing_slicing)]
        {
            self.expect = &self.expect[s.len()..];
        }
        Ok(())
    }
    fn write_char(&mut self, c: char) -> core::fmt::Result {
        // Individually written chars do not alias the input, so compare by
        // value against the front of the expectation.
        let mut rest = self.expect.chars();
        match rest.next() {
            Some(first) if first == c => {
                self.expect = rest.as_slice();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}
/// `fmt::Write` sink that checks whether everything written into it equals an
/// expected `&str`. Succeeds as long as the writes track `expect`;
/// `remaining_len` reports how much of `expect` has not been matched yet.
struct IsNormalizedSinkStr<'a> {
    // The suffix of the original string that has not been matched yet.
    expect: &'a str,
}
impl<'a> IsNormalizedSinkStr<'a> {
    /// Creates a sink expecting exactly `slice`.
    pub fn new(slice: &'a str) -> Self {
        Self { expect: slice }
    }
    /// Number of bytes of the expectation not yet matched.
    pub fn remaining_len(&self) -> usize {
        self.expect.len()
    }
}
impl core::fmt::Write for IsNormalizedSinkStr<'_> {
    fn write_str(&mut self, s: &str) -> core::fmt::Result {
        // The normalizer passes through subslices of the original input, so
        // pointer identity is sufficient to detect a match; comparing
        // contents would be redundant work.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        self.expect = &self.expect[s.len()..];
        Ok(())
    }
    fn write_char(&mut self, c: char) -> core::fmt::Result {
        // Individually written chars do not alias the input, so compare by
        // value against the front of the expectation.
        let mut rest = self.expect.chars();
        match rest.next() {
            Some(first) if first == c => {
                self.expect = rest.as_str();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}