icu_normalizer/lib.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
6#![cfg_attr(not(any(test, doc)), no_std)]
7#![cfg_attr(
8 not(test),
9 deny(
10 clippy::indexing_slicing,
11 clippy::unwrap_used,
12 clippy::expect_used,
13 clippy::panic,
14 )
15)]
16#![warn(missing_docs)]
17
18//! Normalizing text into Unicode Normalization Forms.
19//!
20//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
21//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
22//!
23//! # Functionality
24//!
25//! The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode
26//! Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD.
27//!
28//! Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8,
29//! and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator.
30//!
31//! The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA
32//! Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by
33//! applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the
34//! [`idna`](https://docs.rs/idna/latest/idna/) crate.
35//!
36//! The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and
37//! the canonical compositon operation given two `char`s. It also provides access to the Canonical Combining Class
//! property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/); the types
//! [`CanonicalComposition`](properties::CanonicalComposition), [`CanonicalDecomposition`](properties::CanonicalDecomposition),
//! and [`CanonicalCombiningClassMap`](properties::CanonicalCombiningClassMap) implement the [`harfbuzz_traits`] traits if
//! the `harfbuzz_traits` Cargo feature is enabled.
42//!
43//! Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in
44//! addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive
45//! non-“maybe” answer.
46//!
47//! # Examples
48//!
49//! ```
50//! let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
51//! assert_eq!(nfc.normalize("a\u{0308}"), "ä");
52//! assert!(nfc.is_normalized("ä"));
53//!
54//! let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
55//! assert_eq!(nfd.normalize("ä"), "a\u{0308}");
56//! assert!(!nfd.is_normalized("ä"));
57//! ```
58
59extern crate alloc;
60
61// TODO: The plan is to replace
62// `#[cfg(not(icu4x_unstable_fast_trie_only))]`
63// with
64// `#[cfg(feature = "serde")]`
65// and
66// `#[cfg(icu4x_unstable_fast_trie_only)]`
67// with
68// `#[cfg(not(feature = "serde"))]`
69//
70// Before doing so:
71// * The type of the UTS 46 trie needs to be
72// disentangled from the type of the NFD/NFKD tries.
73// This will involve a more generic iterator hidden
74// inside the public iterator types.
75// * datagen needs to emit fast-mode tries for the
76// NFD and NFKD tries.
77// * The markers and possibly the data struct type
78// for NFD and NFKD need to be revised per policy.
79
/// The code point trie type used for normalization lookups: the general
/// (any-mode) trie by default; see the TODO above about eventually keying
/// this on the `serde` feature instead of the unstable cfg.
#[cfg(not(icu4x_unstable_fast_trie_only))]
type Trie<'trie> = CodePointTrie<'trie, u32>;

/// Fast-mode-only trie type, used when the unstable
/// `icu4x_unstable_fast_trie_only` configuration is active.
#[cfg(icu4x_unstable_fast_trie_only)]
type Trie<'trie> = FastCodePointTrie<'trie, u32>;
85
// We don't depend on icu_properties to minimize deps, but we want to be able
// to ensure we're using the right CCC values.
//
// Produces a `CanonicalCombiningClass` from a hard-coded numeric value.
// When the `icu_properties` feature is enabled, the value is cross-checked
// at compile time (inside a `const` block) against the canonical definition
// in `icu_properties`; a mismatch fails the build.
macro_rules! ccc {
    ($name:ident, $num:expr) => {
        const {
            // Compile-time consistency check against `icu_properties`.
            #[cfg(feature = "icu_properties")]
            if icu_properties::props::CanonicalCombiningClass::$name.to_icu4c_value() != $num {
                panic!("icu_normalizer has incorrect ccc values")
            }
            CanonicalCombiningClass::from_icu4c_value($num)
        }
    };
}
99
100#[cfg(feature = "harfbuzz_traits")]
101mod harfbuzz;
102pub mod properties;
103pub mod provider;
104pub mod uts46;
105
106use crate::provider::CanonicalCompositions;
107use crate::provider::DecompositionData;
108use crate::provider::NormalizerNfdDataV1;
109use crate::provider::NormalizerNfkdDataV1;
110use crate::provider::NormalizerUts46DataV1;
111use alloc::borrow::Cow;
112use alloc::string::String;
113use core::char::REPLACEMENT_CHARACTER;
114use icu_collections::char16trie::Char16Trie;
115use icu_collections::char16trie::Char16TrieIterator;
116use icu_collections::char16trie::TrieResult;
117#[cfg(not(icu4x_unstable_fast_trie_only))]
118use icu_collections::codepointtrie::CodePointTrie;
119#[cfg(icu4x_unstable_fast_trie_only)]
120use icu_collections::codepointtrie::FastCodePointTrie;
121#[cfg(icu4x_unstable_fast_trie_only)]
122use icu_collections::codepointtrie::TypedCodePointTrie;
123#[cfg(feature = "icu_properties")]
124use icu_properties::props::CanonicalCombiningClass;
125use icu_provider::prelude::*;
126use provider::DecompositionTables;
127use provider::NormalizerNfcV1;
128use provider::NormalizerNfdTablesV1;
129use provider::NormalizerNfkdTablesV1;
130use smallvec::SmallVec;
131#[cfg(feature = "utf16_iter")]
132use utf16_iter::Utf16CharsEx;
133#[cfg(feature = "utf8_iter")]
134use utf8_iter::Utf8CharsEx;
135use zerovec::{zeroslice, ZeroSlice};
136
137// The optimizations in the area where `likely` is used
138// are extremely brittle. `likely` is useful in the typed-trie
139// case on the UTF-16 fast path, but in order not to disturb
140// the untyped-trie case on the UTF-16 fast path, make the
141// annotations no-ops in the untyped-trie case.
142
143// `cold_path` and `likely` come from
144// https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .
145// See https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3#commitcomment-164768806
146// for permission to relicense under Unicode-3.0.
147
/// Empty `#[cold]` function: calling it marks the containing code path as
/// unlikely for the optimizer (see the hashbrown import note above).
#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
#[cold]
fn cold_path() {}
152
/// Returns `b` unchanged while hinting to the optimizer that `b` is
/// expected to be `true`, by routing the `false` case through the
/// `#[cold]` `cold_path()` function. Used only on the typed-trie
/// UTF-16 fast path (see the brittleness note above).
#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
pub(crate) fn likely(b: bool) -> bool {
    if b {
        true
    } else {
        cold_path();
        false
    }
}
163
164// End import from https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .
165
/// No-op for the untyped-trie case: returns `b` unchanged with no
/// optimizer hint, so that the annotations in shared code do not
/// disturb codegen on this configuration (see the note above).
#[cfg(all(not(icu4x_unstable_fast_trie_only), feature = "utf16_iter"))]
#[inline(always)]
fn likely(b: bool) -> bool {
    b
}
172
173// This type exists as a shim for `icu_properties` `CanonicalCombiningClass` when the crate is disabled
174// It should not be exposed to users.
175#[cfg(not(feature = "icu_properties"))]
176#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
177struct CanonicalCombiningClass(pub(crate) u8);
178
179#[cfg(not(feature = "icu_properties"))]
180impl CanonicalCombiningClass {
181 const fn from_icu4c_value(v: u8) -> Self {
182 Self(v)
183 }
184 const fn to_icu4c_value(self) -> u8 {
185 self.0
186 }
187}
188
/// Canonical Combining Class 0 (`Not_Reordered`).
const CCC_NOT_REORDERED: CanonicalCombiningClass = ccc!(NotReordered, 0);
/// Canonical Combining Class 230 (`Above`).
const CCC_ABOVE: CanonicalCombiningClass = ccc!(Above, 230);
191
/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
#[derive(Debug, PartialEq, Eq)]
enum IgnorableBehavior {
    /// 0xFFFFFFFF in data is not supported.
    /// (Encountering it debug-asserts; see `delegate_next_no_pending`.)
    Unsupported,
    /// Ignorables are ignored (skipped entirely).
    Ignored,
    /// Ignorables are treated as singleton decompositions
    /// to the REPLACEMENT CHARACTER.
    ReplacementCharacter,
}
203
/// Marker for UTS 46 ignorables.
///
/// See trie-value-format.md
const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;

/// Marker that the decomposition does not round trip via NFC.
///
/// See trie-value-format.md
const NON_ROUND_TRIP_MARKER: u32 = 1 << 30;

/// Marker that the first character of the decomposition
/// can combine backwards.
///
/// See trie-value-format.md
const BACKWARD_COMBINING_MARKER: u32 = 1 << 31;

/// Mask for the bits that have to be zero for this to be a BMP
/// singleton decomposition, or value baked into the surrogate
/// range.
///
/// See trie-value-format.md
const HIGH_ZEROS_MASK: u32 = 0x3FFF0000;

/// Mask for the bits that have to be zero for this to be a complex
/// decomposition.
///
/// See trie-value-format.md
const LOW_ZEROS_MASK: u32 = 0xFFE0;
232
/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
///
/// Implemented as a masked compare: bits 9..=29 of the value must
/// equal the surrogate-range tag 0xD800; the class itself lives in
/// the low byte. See trie-value-format.md.
#[inline]
fn trie_value_has_ccc(trie_value: u32) -> bool {
    let tag = trie_value & 0x3FFF_FE00;
    tag == 0xD800
}
241
/// Checks if the trie signifies a special non-starter decomposition.
///
/// Implemented as a masked compare: bits 8..=29 of the value must
/// equal the 0xD900 tag. See trie-value-format.md.
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
    let tag = trie_value & 0x3FFF_FF00;
    tag == 0xD900
}
248
/// Checks if a trie value signifies a character whose decomposition
/// starts with a non-starter.
///
/// Currently synonymous with carrying a non-zero canonical combining
/// class; kept as a separate name for readability at call sites.
///
/// See trie-value-format.md
fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
    trie_value_has_ccc(trie_value)
}
256
257/// Extracts a canonical combining class (possibly zero) from a trie value.
258///
259/// See trie-value-format.md
260fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
261 if trie_value_has_ccc(trie_value) {
262 CanonicalCombiningClass::from_icu4c_value(trie_value as u8)
263 } else {
264 CCC_NOT_REORDERED
265 }
266}
267
/// The tail (everything after the first character) of the NFKD form U+FDFA
/// as 16-bit units: 17 BMP code units (pushed as starters in
/// `decomposing_next`).
static FDFA_NFKD: [u16; 17] = [
    0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
    0x633, 0x644, 0x645,
];
274
/// Marker value for U+FDFA in NFKD. (Unified with the Hangul syllable marker,
/// but they differ by `NON_ROUND_TRIP_MARKER`.)
///
/// See trie-value-format.md
const FDFA_MARKER: u16 = 1;
280
// These constants originate from page 143 of Unicode 14.0
/// Syllable base
const HANGUL_S_BASE: u32 = 0xAC00;
/// Lead jamo base
const HANGUL_L_BASE: u32 = 0x1100;
/// Vowel jamo base
const HANGUL_V_BASE: u32 = 0x1161;
/// Trail jamo base (deliberately off by one to account for the absence of a trail)
const HANGUL_T_BASE: u32 = 0x11A7;
/// Lead jamo count
const HANGUL_L_COUNT: u32 = 19;
/// Vowel jamo count
const HANGUL_V_COUNT: u32 = 21;
/// Trail jamo count (deliberately off by one to account for the absence of a trail)
const HANGUL_T_COUNT: u32 = 28;
/// Vowel jamo count times trail jamo count (21 * 28)
const HANGUL_N_COUNT: u32 = 588;
/// Syllable count (lead count times `HANGUL_N_COUNT`, i.e. 19 * 588)
const HANGUL_S_COUNT: u32 = 11172;
303
/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
/// are enabled and return `default` if debug assertions are not enabled.
///
/// Use this only if the only reason why `opt` could be `None` is bogus
/// data from the provider.
#[inline(always)]
fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
    match opt {
        Some(val) => val,
        None => {
            // GIGO case
            debug_assert!(false);
            default
        }
    }
}
319
/// Convert a `u32` _obtained from data provider data_ to `char`.
///
/// Invalid scalar values are a GIGO case: they debug-assert and map to
/// U+FFFD via `unwrap_or_gigo`.
#[inline(always)]
fn char_from_u32(u: u32) -> char {
    unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER)
}
325
/// Convert a `u16` _obtained from data provider data_ to `char`.
///
/// Surrogate code units are a GIGO case handled by `char_from_u32`.
#[inline(always)]
fn char_from_u16(u: u16) -> char {
    char_from_u32(u32::from(u))
}
331
/// Shared empty `u16` slice, used when supplementary tables are absent.
const EMPTY_U16: &ZeroSlice<u16> = zeroslice![];

/// Shared empty `char` slice, used when supplementary tables are absent.
const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![];
335
/// Returns `true` iff `start <= c <= end`.
///
/// Implemented with a single wrapping subtraction and compare, so that
/// characters below `start` wrap to a large value and fail the test
/// with one branch. Callers must pass `start <= end`.
#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
    let offset = u32::from(c).wrapping_sub(u32::from(start));
    let span = u32::from(end) - u32::from(start);
    offset <= span
}
340
/// Returns `true` iff `start <= u <= end` (16-bit variant).
///
/// Same wrapping-subtraction trick as `in_inclusive_range`; callers
/// must pass `start <= end`.
#[inline(always)]
#[cfg(feature = "utf16_iter")]
fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
    let offset = u.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
346
/// Performs canonical composition (including Hangul) on a pair of
/// characters or returns `None` if these characters don't compose.
/// Composition exclusions are taken into account.
///
/// `iter` is a fresh iterator over the canonical-compositions trie; it
/// is consulted only when the pair is not a Hangul jamo/syllable pair.
#[inline]
fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    // Fast bailout: if `second` is below the vowel jamo base or at/past the
    // end of the conjoining jamo block, this cannot be a Hangul composition.
    let v = u32::from(second).wrapping_sub(HANGUL_V_BASE);
    if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
        return compose_non_hangul(iter, starter, second);
    }
    if v < HANGUL_V_COUNT {
        // `second` is a vowel jamo: a lead jamo + vowel jamo composes
        // into an LV syllable.
        let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
        if l < HANGUL_L_COUNT {
            let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
            // Safe, because the inputs are known to be in range.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
        }
        return None;
    }
    if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') {
        // `second` is a trail jamo: an LV syllable (`lv % HANGUL_T_COUNT == 0`
        // means no trail yet) + trail jamo composes into an LVT syllable.
        let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
        if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
            let lvt = lv + (u32::from(second) - HANGUL_T_BASE);
            // Safe, because the inputs are known to be in range.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
        }
    }
    None
}
375
/// Performs (non-Hangul) canonical composition on a pair of characters
/// or returns `None` if these characters don't compose. Composition
/// exclusions are taken into account.
fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    // To make the trie smaller, the pairs are stored second character first.
    // Given how this method is used, it's known at each call site whether
    // `second` is or isn't a starter. We could potentially split the trie
    // into two tries depending on whether `second` is a starter.
    match iter.next(second) {
        // `second` does not begin any stored pair.
        TrieResult::NoMatch => None,
        TrieResult::NoValue => match iter.next(starter) {
            // `second` begins a pair, but not with this `starter`.
            TrieResult::NoMatch => None,
            TrieResult::FinalValue(i) => {
                // The composed character; reject non-scalar values from data.
                if let Some(c) = char::from_u32(i as u32) {
                    Some(c)
                } else {
                    // GIGO case
                    debug_assert!(false);
                    None
                }
            }
            TrieResult::NoValue | TrieResult::Intermediate(_) => {
                // GIGO case: a complete pair must yield a final value.
                debug_assert!(false);
                None
            }
        },
        TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => {
            // GIGO case: a single unit can never complete a stored pair.
            debug_assert!(false);
            None
        }
    }
}
410
411/// See trie-value-format.md
412#[inline(always)]
413fn starter_and_decomposes_to_self_impl(trie_val: u32) -> bool {
414 // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
415 // and this function needs to ignore that.
416 (trie_val & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0
417}
418
419/// See trie-value-format.md
420#[inline(always)]
421fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> bool {
422 (trie_val & (NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER)) == 0
423}
424
/// Struct for holding together a character and the value
/// looked up for it from the NFD trie in a more explicit
/// way than an anonymous pair.
/// Also holds a flag about the supplementary-trie provenance.
#[derive(Debug, PartialEq, Eq)]
struct CharacterAndTrieValue {
    // The character itself.
    character: char,
    /// See trie-value-format.md
    trie_val: u32,
}
435
impl CharacterAndTrieValue {
    /// Pairs `c` with the trie value looked up for it.
    #[inline(always)]
    pub fn new(c: char, trie_value: u32) -> Self {
        CharacterAndTrieValue {
            character: c,
            trie_val: trie_value,
        }
    }

    /// True iff the character is a starter that decomposes to itself.
    ///
    /// See trie-value-format.md
    #[inline(always)]
    pub fn starter_and_decomposes_to_self(&self) -> bool {
        starter_and_decomposes_to_self_impl(self.trie_val)
    }

    /// See trie-value-format.md
    #[inline(always)]
    #[cfg(feature = "utf8_iter")]
    pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool {
        // This intentionally leaves `NON_ROUND_TRIP_MARKER` in the value
        // to be compared with zero. U+FFFD has that flag set despite really
        // being round-tripping in order to make UTF-8 errors
        // ineligible for passthrough.
        (self.trie_val & !BACKWARD_COMBINING_MARKER) == 0
    }

    /// True iff this character can combine backwards with a preceding
    /// character.
    ///
    /// See trie-value-format.md
    #[inline(always)]
    pub fn can_combine_backwards(&self) -> bool {
        (self.trie_val & BACKWARD_COMBINING_MARKER) != 0
    }
    /// True iff the character might be passed through without
    /// decomposition (round-trips via NFC).
    ///
    /// See trie-value-format.md
    #[inline(always)]
    pub fn potential_passthrough(&self) -> bool {
        (self.trie_val & NON_ROUND_TRIP_MARKER) == 0
    }
    /// Conjunction of `potential_passthrough()` and the negation of
    /// `can_combine_backwards()`, computed with a single mask.
    ///
    /// See trie-value-format.md
    #[inline(always)]
    pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
        potential_passthrough_and_cannot_combine_backwards_impl(self.trie_val)
    }
}
477
/// Pack a `char` and a `CanonicalCombiningClass` in
/// 32 bits (the former in the lower 24 bits and the
/// latter in the high 8 bits). The latter can be
/// initialized to 0xFF upon creation, in which case
/// it can be actually set later by calling
/// `set_ccc_from_trie_if_not_already_set`. This is
/// a micro optimization to avoid the Canonical
/// Combining Class trie lookup when there is only
/// one combining character in a sequence. This type
/// is intentionally non-`Copy` to get compiler help
/// in making sure that the class is set on the
/// instance on which it is intended to be set
/// and not on a temporary copy.
///
/// Note that 0xFF won't be assigned to an actual
/// canonical combining class per definition D104
/// in The Unicode Standard.
//
// NOTE: The Pernosco debugger has special knowledge
// of this struct. Please do not change the bit layout
// or the crate-module-qualified name of this struct
// without coordination.
#[derive(Debug)]
struct CharacterAndClass(u32);
502
impl CharacterAndClass {
    /// Packs `c` together with a known canonical combining class.
    pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
        CharacterAndClass(u32::from(c) | (u32::from(ccc.to_icu4c_value()) << 24))
    }
    /// Packs `c` with the 0xFF placeholder class, to be resolved later
    /// via `set_ccc_from_trie_if_not_already_set` if actually needed.
    pub fn new_with_placeholder(c: char) -> Self {
        CharacterAndClass(u32::from(c) | ((0xFF) << 24))
    }
    /// Packs the character of `c_tv` with the class extracted from its
    /// trie value.
    pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
        Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
    }
    /// Packs `c` with class 0 (a starter).
    pub fn new_starter(c: char) -> Self {
        CharacterAndClass(u32::from(c))
    }
    /// This method must exist for Pernosco to apply its special rendering.
    /// Also, this must not be dead code!
    pub fn character(&self) -> char {
        // Safe, because the low 24 bits came from a `char`
        // originally.
        unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
    }
    /// This method must exist for Pernosco to apply its special rendering.
    pub fn ccc(&self) -> CanonicalCombiningClass {
        CanonicalCombiningClass::from_icu4c_value((self.0 >> 24) as u8)
    }

    /// Unpacks both the character and its class.
    pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
        (self.character(), self.ccc())
    }
    /// Replaces the 0xFF placeholder class (if present) with the real
    /// class looked up from `trie`; an already-set class is left alone.
    pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &Trie) {
        if self.0 >> 24 != 0xFF {
            return;
        }
        let scalar = self.0 & 0xFFFFFF;
        self.0 =
            ((ccc_from_trie_value(trie.get32_u32(scalar)).to_icu4c_value() as u32) << 24) | scalar;
    }
}
540
// This function exists as a borrow check helper.
//
// Canonically orders `slice`: resolves any placeholder combining classes
// from `trie`, then sorts by class. `sort_by_key` is a stable sort, which
// matters here: characters with equal combining classes must keep their
// relative order.
#[inline(always)]
fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &Trie) {
    // We don't look up the canonical combining class for starters
    // or for single combining characters between starters. When
    // there's more than one combining character between starters,
    // we look up the canonical combining class for each character
    // exactly once.
    if slice.len() < 2 {
        return;
    }
    slice
        .iter_mut()
        .for_each(|cc| cc.set_ccc_from_trie_if_not_already_set(trie));
    slice.sort_by_key(|cc| cc.ccc());
}
557
/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed `char` sequence.
#[derive(Debug)]
pub struct Decomposition<'data, I>
where
    I: Iterator<Item = char>,
{
    delegate: I,
    buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA
    /// The index of the next item to be read from `buffer`.
    /// The purpose of this index is to avoid having to move
    /// the rest upon every read.
    buffer_pos: usize,
    // At the start of `next()` if not `None`, this is a pending unnormalized
    // starter. When `Decomposition` appears alone, this is never a non-starter.
    // However, when `Decomposition` appears inside a `Composition`, this
    // may become a non-starter before `decomposing_next()` is called.
    pending: Option<CharacterAndTrieValue>, // None at end of stream
    // See trie-value-format.md
    trie: &'data Trie<'data>,
    // Expansion tables: 16-bit code units for BMP decompositions and
    // `char`s for the rest. The `supplementary_*` pair is set from
    // `supplementary_tables` in `new_with_supplements`, or empty.
    scalars16: &'data ZeroSlice<u16>,
    scalars24: &'data ZeroSlice<char>,
    supplementary_scalars16: &'data ZeroSlice<u16>,
    supplementary_scalars24: &'data ZeroSlice<char>,
    /// The lowest character for which either of the following does
    /// not hold:
    /// 1. Decomposes to self.
    /// 2. Decomposition starts with a non-starter
    decomposition_passthrough_bound: u32, // never above 0xC0
    ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
}
589
590impl<'data, I> Decomposition<'data, I>
591where
592 I: Iterator<Item = char>,
593{
    /// Constructs a decomposing iterator adapter from a delegate
    /// iterator and references to the necessary data, without
    /// supplementary data.
    ///
    /// Use `DecomposingNormalizer::normalize_iter()` instead unless
    /// there's a good reason to use this constructor directly.
    ///
    /// Public but hidden in order to be able to use this from the
    /// collator.
    #[doc(hidden)] // used in collator
    pub fn new(
        delegate: I,
        decompositions: &'data DecompositionData,
        tables: &'data DecompositionTables,
    ) -> Self {
        // 0xC0 is the NFD passthrough bound (U+00C0 is the first character
        // with a canonical decomposition); ignorables are a UTS 46-only
        // concept, hence `Unsupported` here.
        Self::new_with_supplements(
            delegate,
            decompositions,
            tables,
            None,
            0xC0,
            IgnorableBehavior::Unsupported,
        )
    }
618
    /// Constructs a decomposing iterator adapter from a delegate
    /// iterator and references to the necessary data, including
    /// supplementary data.
    ///
    /// Use `DecomposingNormalizer::normalize_iter()` instead unless
    /// there's a good reason to use this constructor directly.
    ///
    /// The iterator is primed by running `next()` once to consume the
    /// U+FFFF placeholder installed in `pending`.
    fn new_with_supplements(
        delegate: I,
        decompositions: &'data DecompositionData,
        tables: &'data DecompositionTables,
        supplementary_tables: Option<&'data DecompositionTables>,
        decomposition_passthrough_bound: u8,
        ignorable_behavior: IgnorableBehavior,
    ) -> Self {
        let mut ret = Decomposition::<I> {
            delegate,
            buffer: SmallVec::new(), // Normalized
            buffer_pos: 0,
            // Initialize with a placeholder starter in case
            // the real stream starts with a non-starter.
            pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)),
            #[allow(clippy::useless_conversion, clippy::expect_used)] // Expectation always succeeds when untyped tries are in use
            trie: <&Trie>::try_from(&decompositions.trie).expect("Unexpected trie type in data"),
            scalars16: &tables.scalars16,
            scalars24: &tables.scalars24,
            supplementary_scalars16: if let Some(supplementary) = supplementary_tables {
                &supplementary.scalars16
            } else {
                EMPTY_U16
            },
            supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
                &supplementary.scalars24
            } else {
                EMPTY_CHAR
            },
            decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
            ignorable_behavior,
        };
        let _ = ret.next(); // Remove the U+FFFF placeholder
        ret
    }
660
    /// Reads a decomposition of `len` 16-bit units at `offset` in `slice16`,
    /// pushes everything after the first unit into `self.buffer`, and
    /// returns the first character together with the index within the
    /// pushed tail at which the trailing run of combining characters begins.
    /// Out-of-range `offset..offset + len` is a GIGO case that yields
    /// U+FFFD with an empty tail.
    fn push_decomposition16(
        &mut self,
        offset: usize,
        len: usize,
        only_non_starters_in_trail: bool,
        slice16: &ZeroSlice<u16>,
    ) -> (char, usize) {
        let (starter, tail) = slice16
            .get_subslice(offset..offset + len)
            .and_then(|slice| slice.split_first())
            .map_or_else(
                || {
                    // GIGO case
                    debug_assert!(false);
                    (REPLACEMENT_CHARACTER, EMPTY_U16)
                },
                |(first, trail)| (char_from_u16(first), trail),
            );
        if only_non_starters_in_trail {
            // All the rest are combining
            self.buffer.extend(
                tail.iter()
                    .map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))),
            );
            (starter, 0)
        } else {
            let mut i = 0;
            let mut combining_start = 0;
            for u in tail.iter() {
                let ch = char_from_u16(u);
                let trie_value = self.trie.get(ch);
                self.buffer.push(CharacterAndClass::new_with_trie_value(
                    CharacterAndTrieValue::new(ch, trie_value),
                ));
                i += 1;
                // Half-width kana and iota subscript don't occur in the tails
                // of these multicharacter decompositions.
                if !decomposition_starts_with_non_starter(trie_value) {
                    combining_start = i;
                }
            }
            (starter, combining_start)
        }
    }
705
    /// Reads a decomposition of `len` characters at `offset` in `slice32`,
    /// pushes everything after the first character into `self.buffer`, and
    /// returns the first character together with the index within the
    /// pushed tail at which the trailing run of combining characters begins.
    /// Out-of-range `offset..offset + len` is a GIGO case that yields
    /// U+FFFD with an empty tail.
    fn push_decomposition32(
        &mut self,
        offset: usize,
        len: usize,
        only_non_starters_in_trail: bool,
        slice32: &ZeroSlice<char>,
    ) -> (char, usize) {
        let (starter, tail) = slice32
            .get_subslice(offset..offset + len)
            .and_then(|slice| slice.split_first())
            .unwrap_or_else(|| {
                // GIGO case
                debug_assert!(false);
                (REPLACEMENT_CHARACTER, EMPTY_CHAR)
            });
        if only_non_starters_in_trail {
            // All the rest are combining
            self.buffer
                .extend(tail.iter().map(CharacterAndClass::new_with_placeholder));
            (starter, 0)
        } else {
            let mut i = 0;
            let mut combining_start = 0;
            for ch in tail.iter() {
                let trie_value = self.trie.get(ch);
                self.buffer.push(CharacterAndClass::new_with_trie_value(
                    CharacterAndTrieValue::new(ch, trie_value),
                ));
                i += 1;
                // Half-width kana and iota subscript don't occur in the tails
                // of these multicharacter decompositions.
                if !decomposition_starts_with_non_starter(trie_value) {
                    combining_start = i;
                }
            }
            (starter, combining_start)
        }
    }
744
    /// Pairs `c` with its value looked up from the NFD trie.
    #[inline(always)]
    fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
        CharacterAndTrieValue::new(c, self.trie.get(c))
    }
749
    /// Reads the next character from the delegate iterator, pairs it with
    /// its trie value, and applies the configured treatment of UTS 46
    /// ignorables. Must only be called when `pending` is empty.
    fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
        debug_assert!(self.pending.is_none());
        loop {
            let c = self.delegate.next()?;

            // TODO(#2384): Measure if this check is actually an optimization.
            if u32::from(c) < self.decomposition_passthrough_bound {
                // Below the passthrough bound, treat the character as having
                // trie value 0 (decomposes to self) without a trie lookup.
                return Some(CharacterAndTrieValue::new(c, 0));
            }

            let trie_val = self.trie.get(c);
            // TODO: Can we do something better about the cost of this branch in the
            // non-UTS 46 case?
            if trie_val == IGNORABLE_MARKER {
                match self.ignorable_behavior {
                    IgnorableBehavior::Unsupported => {
                        // This configuration should never see an ignorable
                        // in data; flag bogus data in debug builds.
                        debug_assert!(false);
                    }
                    IgnorableBehavior::ReplacementCharacter => {
                        return Some(CharacterAndTrieValue::new(
                            c,
                            u32::from(REPLACEMENT_CHARACTER) | NON_ROUND_TRIP_MARKER,
                        ));
                    }
                    IgnorableBehavior::Ignored => {
                        // Else ignore this character by reading the next one from the delegate.
                        continue;
                    }
                }
            }
            return Some(CharacterAndTrieValue::new(c, trie_val));
        }
    }
783
784 fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
785 if let Some(pending) = self.pending.take() {
786 // Only happens as part of `Composition` and as part of
787 // the contiguous-buffer methods of `DecomposingNormalizer`.
788 // I.e. does not happen as part of standalone iterator
789 // usage of `Decomposition`.
790 Some(pending)
791 } else {
792 self.delegate_next_no_pending()
793 }
794 }
795
    /// Decomposes `c_and_trie_val`: pushes everything after the first
    /// character of its decomposition into `self.buffer`, then gathers and
    /// canonically orders any following combining characters from the
    /// delegate. Returns the first character of the decomposition.
    fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
        let (starter, combining_start) = {
            let c = c_and_trie_val.character;
            // See trie-value-format.md
            let decomposition = c_and_trie_val.trie_val;
            // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
            // and that flag needs to be ignored here.
            if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
                // The character is its own decomposition
                (c, 0)
            } else {
                let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
                let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
                if !high_zeros && !low_zeros {
                    // Decomposition into two BMP characters: starter and non-starter,
                    // packed into the low 30 bits as two 15-bit fields.
                    let starter = char_from_u32(decomposition & 0x7FFF);
                    let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
                    self.buffer
                        .push(CharacterAndClass::new_with_placeholder(combining));
                    (starter, 0)
                } else if high_zeros {
                    // Do the check by looking at `c` instead of looking at a marker
                    // in `singleton` below, because if we looked at the trie value,
                    // we'd still have to check that `c` is in the Hangul syllable
                    // range in order for the subsequent interpretations as `char`
                    // to be safe.
                    // Alternatively, `FDFA_MARKER` and the Hangul marker could
                    // be unified. That would add a branch for Hangul and remove
                    // a branch from singleton decompositions. It seems more
                    // important to favor Hangul syllables than singleton
                    // decompositions.
                    // Note that it would be valid to hoist this Hangul check
                    // one or even two steps earlier in this check hierarchy.
                    // Right now, it's assumed the kind of decompositions into
                    // BMP starter and non-starter, which occur in many languages,
                    // should be checked before Hangul syllables, which are about
                    // one language specifically. Hopefully, we get some
                    // instruction-level parallelism out of the disjointness of
                    // operations on `c` and `decomposition`.
                    let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
                    if hangul_offset < HANGUL_S_COUNT {
                        debug_assert_eq!(decomposition, 1);
                        // Hangul syllable
                        // The math here comes from page 144 of Unicode 14.0
                        let l = hangul_offset / HANGUL_N_COUNT;
                        let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
                        let t = hangul_offset % HANGUL_T_COUNT;

                        // The unsafe blocks here are OK, because the values stay
                        // within the Hangul jamo block and, therefore, the scalar
                        // value range by construction.
                        self.buffer.push(CharacterAndClass::new_starter(unsafe {
                            core::char::from_u32_unchecked(HANGUL_V_BASE + v)
                        }));
                        let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
                        if t != 0 {
                            self.buffer.push(CharacterAndClass::new_starter(unsafe {
                                core::char::from_u32_unchecked(HANGUL_T_BASE + t)
                            }));
                            (first, 2)
                        } else {
                            (first, 1)
                        }
                    } else {
                        let singleton = decomposition as u16;
                        if singleton != FDFA_MARKER {
                            // Decomposition into one BMP character
                            let starter = char_from_u16(singleton);
                            (starter, 0)
                        } else {
                            // Special case for the NFKD form of U+FDFA.
                            self.buffer.extend(FDFA_NFKD.map(|u| {
                                // SAFETY: `FDFA_NFKD` is known not to contain
                                // surrogates.
                                CharacterAndClass::new_starter(unsafe {
                                    core::char::from_u32_unchecked(u32::from(u))
                                })
                            }));
                            ('\u{0635}', 17)
                        }
                    }
                } else {
                    debug_assert!(low_zeros);
                    // Complex decomposition: bits 16..=29 hold a one-based
                    // offset into the concatenation of the four expansion
                    // tables; the low bits hold length and trail-shape info.
                    // Only 12 of 14 bits used as of Unicode 16.
                    let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
                    // Only 3 of 4 bits used as of Unicode 16.
                    let len_bits = decomposition & 0b1111;
                    let only_non_starters_in_trail = (decomposition & 0b10000) != 0;
                    if offset < self.scalars16.len() {
                        self.push_decomposition16(
                            offset,
                            (len_bits + 2) as usize,
                            only_non_starters_in_trail,
                            self.scalars16,
                        )
                    } else if offset < self.scalars16.len() + self.scalars24.len() {
                        self.push_decomposition32(
                            offset - self.scalars16.len(),
                            (len_bits + 1) as usize,
                            only_non_starters_in_trail,
                            self.scalars24,
                        )
                    } else if offset
                        < self.scalars16.len()
                            + self.scalars24.len()
                            + self.supplementary_scalars16.len()
                    {
                        self.push_decomposition16(
                            offset - (self.scalars16.len() + self.scalars24.len()),
                            (len_bits + 2) as usize,
                            only_non_starters_in_trail,
                            self.supplementary_scalars16,
                        )
                    } else {
                        self.push_decomposition32(
                            offset
                                - (self.scalars16.len()
                                    + self.scalars24.len()
                                    + self.supplementary_scalars16.len()),
                            (len_bits + 1) as usize,
                            only_non_starters_in_trail,
                            self.supplementary_scalars24,
                        )
                    }
                }
            }
        };
        // Either we're inside `Composition` or `self.pending.is_none()`.

        self.gather_and_sort_combining(combining_start);
        starter
    }
928
    /// Reads characters from the delegate iterator into `self.buffer` until
    /// reaching the next starter (which is stashed in `self.pending`) or the
    /// end of input, then canonically sorts the combining characters gathered
    /// since index `combining_start` by their Canonical Combining Class.
    fn gather_and_sort_combining(&mut self, combining_start: usize) {
        // Not a `for` loop to avoid holding a mutable reference to `self` across
        // the loop body.
        while let Some(ch_and_trie_val) = self.delegate_next() {
            if !trie_value_has_ccc(ch_and_trie_val.trie_val) {
                // A starter: stop gathering and leave it for the next
                // `decomposing_next()` call to process.
                self.pending = Some(ch_and_trie_val);
                break;
            } else if !trie_value_indicates_special_non_starter_decomposition(
                ch_and_trie_val.trie_val,
            ) {
                // Ordinary non-starter: buffer it together with its combining class.
                self.buffer
                    .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
            } else {
                // Special non-starter decompositions, mapped inline here.
                // (The Tibetan special cases are starters that decompose into
                // non-starters.)
                let mapped = match ch_and_trie_val.character {
                    '\u{0340}' => {
                        // COMBINING GRAVE TONE MARK
                        CharacterAndClass::new('\u{0300}', CCC_ABOVE)
                    }
                    '\u{0341}' => {
                        // COMBINING ACUTE TONE MARK
                        CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                    }
                    '\u{0343}' => {
                        // COMBINING GREEK KORONIS
                        CharacterAndClass::new('\u{0313}', CCC_ABOVE)
                    }
                    '\u{0344}' => {
                        // COMBINING GREEK DIALYTIKA TONOS
                        self.buffer
                            .push(CharacterAndClass::new('\u{0308}', CCC_ABOVE));
                        CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                    }
                    '\u{0F73}' => {
                        // TIBETAN VOWEL SIGN II
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F72}', ccc!(CCC130, 130))
                    }
                    '\u{0F75}' => {
                        // TIBETAN VOWEL SIGN UU
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F74}', ccc!(CCC132, 132))
                    }
                    '\u{0F81}' => {
                        // TIBETAN VOWEL SIGN REVERSED II
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F80}', ccc!(CCC130, 130))
                    }
                    '\u{FF9E}' => {
                        // HALFWIDTH KATAKANA VOICED SOUND MARK
                        CharacterAndClass::new('\u{3099}', ccc!(KanaVoicing, 8))
                    }
                    '\u{FF9F}' => {
                        // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
                        CharacterAndClass::new('\u{309A}', ccc!(KanaVoicing, 8))
                    }
                    _ => {
                        // GIGO case
                        debug_assert!(false);
                        CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
                    }
                };
                self.buffer.push(mapped);
            }
        }
        // Slicing succeeds by construction; we've always ensured that `combining_start`
        // is in permissible range.
        #[expect(clippy::indexing_slicing)]
        sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
    }
1002}
1003
1004impl<I> Iterator for Decomposition<'_, I>
1005where
1006 I: Iterator<Item = char>,
1007{
1008 type Item = char;
1009
1010 fn next(&mut self) -> Option<char> {
1011 if let Some(ret) = self.buffer.get(self.buffer_pos).map(|c| c.character()) {
1012 self.buffer_pos += 1;
1013 if self.buffer_pos == self.buffer.len() {
1014 self.buffer.clear();
1015 self.buffer_pos = 0;
1016 }
1017 return Some(ret);
1018 }
1019 debug_assert_eq!(self.buffer_pos, 0);
1020 let c_and_trie_val = self.pending.take()?;
1021 Some(self.decomposing_next(c_and_trie_val))
1022 }
1023}
1024
/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed and then canonically composed `char` sequence.
#[derive(Debug)]
pub struct Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    /// The decomposing part of the normalizer that operates before
    /// the canonical composition is performed on its output.
    decomposition: Decomposition<'data, I>,
    /// Non-Hangul canonical composition data.
    canonical_compositions: Char16Trie<'data>,
    /// To make `next()` yield in cases where there's a non-composing
    /// starter in the decomposition buffer, we put it here to let it
    /// wait for the next `next()` call (or a jump forward within the
    /// `next()` call).
    unprocessed_starter: Option<char>,
    /// The lowest character for which any one of the following does
    /// not hold:
    /// 1. Roundtrips via decomposition and recomposition.
    /// 2. Decomposition starts with a non-starter
    /// 3. Is not a backward-combining starter
    ///
    /// Characters below this bound can skip the composition machinery.
    /// (Widened from `u16` at construction; see `Composition::new`.)
    composition_passthrough_bound: u32,
}
1049
1050impl<'data, I> Composition<'data, I>
1051where
1052 I: Iterator<Item = char>,
1053{
1054 fn new(
1055 decomposition: Decomposition<'data, I>,
1056 canonical_compositions: Char16Trie<'data>,
1057 composition_passthrough_bound: u16,
1058 ) -> Self {
1059 Self {
1060 decomposition,
1061 canonical_compositions,
1062 unprocessed_starter: None,
1063 composition_passthrough_bound: u32::from(composition_passthrough_bound),
1064 }
1065 }
1066
1067 /// Performs canonical composition (including Hangul) on a pair of
1068 /// characters or returns `None` if these characters don't compose.
1069 /// Composition exclusions are taken into account.
1070 #[inline(always)]
1071 pub fn compose(&self, starter: char, second: char) -> Option<char> {
1072 compose(self.canonical_compositions.iter(), starter, second)
1073 }
1074
1075 /// Performs (non-Hangul) canonical composition on a pair of characters
1076 /// or returns `None` if these characters don't compose. Composition
1077 /// exclusions are taken into account.
1078 #[inline(always)]
1079 fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
1080 compose_non_hangul(self.canonical_compositions.iter(), starter, second)
1081 }
1082}
1083
impl<I> Iterator for Composition<'_, I>
where
    I: Iterator<Item = char>,
{
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0); // The compiler can't figure out that this gets overwritten before use.
        // Phase 1: drain any already-decomposed characters from the buffer
        // and try the passthrough fast track before doing full composition.
        if self.unprocessed_starter.is_none() {
            // The loop is only broken out of as goto forward
            #[expect(clippy::never_loop)]
            loop {
                if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    self.decomposition.buffer_pos += 1;
                    if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
                        self.decomposition.buffer.clear();
                        self.decomposition.buffer_pos = 0;
                    }
                    if ccc == CCC_NOT_REORDERED {
                        // Previous decomposition contains a starter. This must
                        // now become the `unprocessed_starter` for it to have
                        // a chance to compose with the upcoming characters.
                        //
                        // E.g. parenthesized Hangul in NFKC comes through here,
                        // but suitable composition exclusion could exercise this
                        // in NFC.
                        self.unprocessed_starter = Some(character);
                        break; // We already have a starter, so skip taking one from `pending`.
                    }
                    return Some(character);
                }
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                undecomposed_starter = self.decomposition.pending.take()?;
                if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
                    || undecomposed_starter.potential_passthrough()
                {
                    // TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming
                    // character is not below `decomposition_passthrough_bound` but is
                    // below `composition_passthrough_bound`, we read from the trie
                    // unnecessarily.
                    if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
                        let cannot_combine_backwards = u32::from(upcoming.character)
                            < self.composition_passthrough_bound
                            || !upcoming.can_combine_backwards();
                        self.decomposition.pending = Some(upcoming);
                        if cannot_combine_backwards {
                            // Fast-track succeeded!
                            return Some(undecomposed_starter.character);
                        }
                    } else {
                        // End of stream
                        return Some(undecomposed_starter.character);
                    }
                }
                break; // Not actually looping
            }
        }
        let mut starter = '\u{0}'; // The compiler can't figure out this gets overwritten before use.

        // The point of having this boolean is to have only one call site to
        // `self.decomposition.decomposing_next`, which is hopefully beneficial for
        // code size under inlining.
        let mut attempt_composition = false;
        // Phase 2: full composition. Each iteration obtains a starter,
        // composes it with buffered non-starters, and either yields it or
        // attempts composition with the next pending starter.
        loop {
            if let Some(unprocessed) = self.unprocessed_starter.take() {
                debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0));
                debug_assert_eq!(starter, '\u{0}');
                starter = unprocessed;
            } else {
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
                if !attempt_composition {
                    starter = next_starter;
                } else if let Some(composed) = self.compose(starter, next_starter) {
                    starter = composed;
                } else {
                    // This is our yield point. We'll pick this up above in the
                    // next call to `next()`.
                    self.unprocessed_starter = Some(next_starter);
                    return Some(starter);
                }
            }
            // We first loop by index to avoid moving the contents of `buffer`, but
            // if there's a discontiguous match, we'll start modifying `buffer` instead.
            loop {
                let (character, ccc) = if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    (character, ccc)
                } else {
                    self.decomposition.buffer.clear();
                    self.decomposition.buffer_pos = 0;
                    break;
                };
                if let Some(composed) = self.compose(starter, character) {
                    starter = composed;
                    self.decomposition.buffer_pos += 1;
                    continue;
                }
                let mut most_recent_skipped_ccc = ccc;
                {
                    let _ = self
                        .decomposition
                        .buffer
                        .drain(0..self.decomposition.buffer_pos);
                }
                self.decomposition.buffer_pos = 0;
                if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                    // We failed to compose a starter. Discontiguous match not allowed.
                    // We leave the starter in `buffer` for `next()` to find.
                    return Some(starter);
                }
                let mut i = 1; // We have skipped one non-starter.
                while let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(i)
                    .map(|c| c.character_and_ccc())
                {
                    if ccc == CCC_NOT_REORDERED {
                        // Discontiguous match not allowed.
                        return Some(starter);
                    }
                    debug_assert!(ccc >= most_recent_skipped_ccc);
                    if ccc != most_recent_skipped_ccc {
                        // Using the non-Hangul version as a micro-optimization, since
                        // we already rejected the case where `second` is a starter
                        // above, and conjoining jamo are starters.
                        if let Some(composed) = self.compose_non_hangul(starter, character) {
                            self.decomposition.buffer.remove(i);
                            starter = composed;
                            continue;
                        }
                    }
                    most_recent_skipped_ccc = ccc;
                    i += 1;
                }
                break;
            }

            debug_assert_eq!(self.decomposition.buffer_pos, 0);

            if !self.decomposition.buffer.is_empty() {
                return Some(starter);
            }
            // Now we need to check if composition with an upcoming starter is possible.
            if let Some(pending) = self.decomposition.pending.take() {
                // We know that `pending_starter` decomposes to start with a starter.
                // Otherwise, it would have been moved to `self.decomposition.buffer`
                // by `self.decomposing_next()`. We do this set lookup here in order
                // to get an opportunity to go back to the fast track.
                // Note that this check has to happen _after_ checking that `pending`
                // holds a character, because this flag isn't defined to be meaningful
                // when `pending` isn't holding a character.
                if u32::from(pending.character) < self.composition_passthrough_bound
                    || !pending.can_combine_backwards()
                {
                    // Won't combine backwards anyway.
                    self.decomposition.pending = Some(pending);
                    return Some(starter);
                }
                // Consume what we peeked.
                undecomposed_starter = pending;
                // The following line is OK, because we're about to loop back
                // to `self.decomposition.decomposing_next(c);`, which will
                // restore the between-`next()`-calls invariant of `pending`
                // before this function returns.
                attempt_composition = true;
                continue;
            }
            // End of input
            return Some(starter);
        }
    }
}
1268
// Generates a composing (`normalize_to`-style) method that streams the
// normalization of `$text` into `$sink`. The `$fast` block supplies the
// encoding-specific passthrough fast path; `$prolog` runs first; the other
// metavariables name the locals so the fast path can refer to them.
macro_rules! composing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $always_valid_utf:literal,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $composition:ident,
     $composition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $len_utf:ident,
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog
            let mut $composition = self.normalize_iter($text.chars());
            debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
            // Flush anything the iterator constructor may have buffered.
            for cc in $composition.decomposition.buffer.drain(..) {
                $sink.write_char(cc.character())?;
            }

            // Try to get the compiler to hoist the bound to a register.
            let $composition_passthrough_bound = $composition.composition_passthrough_bound;
            'outer: loop {
                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                let mut $undecomposed_starter =
                    if let Some(pending) = $composition.decomposition.pending.take() {
                        pending
                    } else {
                        return Ok(());
                    };
                if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
                    $undecomposed_starter.potential_passthrough()
                {
                    // We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
                    // was returned in response to an error by the iterator. Assume the
                    // latter for correctness even though it pessimizes the former.
                    if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
                        let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
                        // The `$fast` block must either:
                        // 1. Return due to reaching EOF
                        // 2. Leave a starter with its trie value in `$undecomposed_starter`
                        //    and, if there is still more input, leave the next character
                        //    and its trie value in `$composition.decomposition.pending`.
                        $fast
                    }
                }
                // Fast track above, full algorithm below
                let mut starter = $composition
                    .decomposition
                    .decomposing_next($undecomposed_starter);
                'bufferloop: loop {
                    // We first loop by index to avoid moving the contents of `buffer`, but
                    // if there's a discontiguous match, we'll start modifying `buffer` instead.
                    loop {
                        let (character, ccc) = if let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get($composition.decomposition.buffer_pos)
                            .map(|c| c.character_and_ccc())
                        {
                            (character, ccc)
                        } else {
                            $composition.decomposition.buffer.clear();
                            $composition.decomposition.buffer_pos = 0;
                            break;
                        };
                        if let Some(composed) = $composition.compose(starter, character) {
                            starter = composed;
                            $composition.decomposition.buffer_pos += 1;
                            continue;
                        }
                        let mut most_recent_skipped_ccc = ccc;
                        if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                            // We failed to compose a starter. Discontiguous match not allowed.
                            // Write the current `starter` we've been composing, make the unmatched
                            // starter in the buffer the new `starter` (we know it's been decomposed)
                            // and process the rest of the buffer with that as the starter.
                            $sink.write_char(starter)?;
                            starter = character;
                            $composition.decomposition.buffer_pos += 1;
                            continue 'bufferloop;
                        } else {
                            {
                                let _ = $composition
                                    .decomposition
                                    .buffer
                                    .drain(0..$composition.decomposition.buffer_pos);
                            }
                            $composition.decomposition.buffer_pos = 0;
                        }
                        let mut i = 1; // We have skipped one non-starter.
                        while let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get(i)
                            .map(|c| c.character_and_ccc())
                        {
                            if ccc == CCC_NOT_REORDERED {
                                // Discontiguous match not allowed.
                                $sink.write_char(starter)?;
                                for cc in $composition.decomposition.buffer.drain(..i) {
                                    $sink.write_char(cc.character())?;
                                }
                                starter = character;
                                {
                                    let removed = $composition.decomposition.buffer.remove(0);
                                    debug_assert_eq!(starter, removed.character());
                                }
                                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                                continue 'bufferloop;
                            }
                            debug_assert!(ccc >= most_recent_skipped_ccc);
                            if ccc != most_recent_skipped_ccc {
                                // Using the non-Hangul version as a micro-optimization, since
                                // we already rejected the case where `second` is a starter
                                // above, and conjoining jamo are starters.
                                if let Some(composed) =
                                    $composition.compose_non_hangul(starter, character)
                                {
                                    $composition.decomposition.buffer.remove(i);
                                    starter = composed;
                                    continue;
                                }
                            }
                            most_recent_skipped_ccc = ccc;
                            i += 1;
                        }
                        break;
                    }
                    debug_assert_eq!($composition.decomposition.buffer_pos, 0);

                    if !$composition.decomposition.buffer.is_empty() {
                        $sink.write_char(starter)?;
                        for cc in $composition.decomposition.buffer.drain(..) {
                            $sink.write_char(cc.character())?;
                        }
                        // We had non-empty buffer, so can't compose with upcoming.
                        continue 'outer;
                    }
                    // Now we need to check if composition with an upcoming starter is possible.
                    if $composition.decomposition.pending.is_some() {
                        // We know that `pending_starter` decomposes to start with a starter.
                        // Otherwise, it would have been moved to `composition.decomposition.buffer`
                        // by `composition.decomposing_next()`. We do this set lookup here in order
                        // to get an opportunity to go back to the fast track.
                        // Note that this check has to happen _after_ checking that `pending`
                        // holds a character, because this flag isn't defined to be meaningful
                        // when `pending` isn't holding a character.
                        let pending = $composition.decomposition.pending.as_ref().unwrap();
                        if u32::from(pending.character) < $composition.composition_passthrough_bound
                            || !pending.can_combine_backwards()
                        {
                            // Won't combine backwards anyway.
                            $sink.write_char(starter)?;
                            continue 'outer;
                        }
                        let pending_starter = $composition.decomposition.pending.take().unwrap();
                        let decomposed = $composition.decomposition.decomposing_next(pending_starter);
                        if let Some(composed) = $composition.compose(starter, decomposed) {
                            starter = composed;
                        } else {
                            $sink.write_char(starter)?;
                            starter = decomposed;
                        }
                        continue 'bufferloop;
                    }
                    // End of input
                    $sink.write_char(starter)?;
                    return Ok(());
                } // 'bufferloop
            }
        }
    };
}
1453
// Generates a decomposing (`normalize_to`-style) method that streams the
// decomposition of `$text` into `$sink`. `$fast` is the encoding-specific
// passthrough fast path; the metavariables name locals so the fast path
// can refer to them.
macro_rules! decomposing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $decomposition:ident,
     $decomposition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $outer:lifetime, // loop labels use lifetime tokens
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog

            let mut $decomposition = self.normalize_iter($text.chars());
            debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);

            // Try to get the compiler to hoist the bound to a register.
            let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
            $outer: loop {
                // Flush buffered (already decomposed and sorted) characters.
                for cc in $decomposition.buffer.drain(..) {
                    $sink.write_char(cc.character())?;
                }
                debug_assert_eq!($decomposition.buffer_pos, 0);
                let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
                    pending
                } else {
                    return Ok(());
                };
                if $undecomposed_starter.starter_and_decomposes_to_self() {
                    // Don't bother including `undecomposed_starter` in a contiguous buffer
                    // write: Just write it right away:
                    $sink.write_char($undecomposed_starter.character)?;

                    let $pending_slice = $decomposition.delegate.$as_slice();
                    $fast
                }
                let starter = $decomposition.decomposing_next($undecomposed_starter);
                $sink.write_char(starter)?;
            }
        }
    };
}
1507
// Generates the public convenience methods (`normalize`, `split_normalized`,
// `is_normalized`, and the UTF-8/UTF-16 variants) shared by the composing
// and decomposing normalizer types; each type must also provide the
// corresponding `normalize_to`/`normalize_utf8_to`/`normalize_utf16_to`.
macro_rules! normalizer_methods {
    () => {
        /// Normalize a string slice into a `Cow<'a, str>`.
        pub fn normalize<'a>(&self, text: &'a str) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized(text);
            if tail.is_empty() {
                // Entire input already normalized: borrow, no allocation.
                return Cow::Borrowed(head);
            }
            let mut ret = String::new();
            ret.reserve(text.len());
            ret.push_str(head);
            let _ = self.normalize_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a string slice into maximum normalized prefix and unnormalized suffix
        /// such that the concatenation of the prefix and the normalization of the suffix
        /// is the normalization of the whole input.
        pub fn split_normalized<'a>(&self, text: &'a str) -> (&'a str, &'a str) {
            let up_to = self.is_normalized_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                ("", text)
            })
        }

        /// Return the index a string slice is normalized up to.
        fn is_normalized_up_to(&self, text: &str) -> usize {
            // The sink compares the would-be output against the input and
            // tracks how much of the input it has confirmed as normalized.
            let mut sink = IsNormalizedSinkStr::new(text);
            let _ = self.normalize_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Check whether a string slice is normalized.
        pub fn is_normalized(&self, text: &str) -> bool {
            self.is_normalized_up_to(text) == text.len()
        }

        /// Normalize a slice of potentially-invalid UTF-16 into a `Cow<'a, [u16]>`.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
            let (head, tail) = self.split_normalized_utf16(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = alloc::vec::Vec::with_capacity(text.len());
            ret.extend_from_slice(head);
            let _ = self.normalize_utf16_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a slice of potentially-invalid UTF-16 into maximum normalized (and valid)
        /// prefix and unnormalized suffix such that the concatenation of the prefix and the
        /// normalization of the suffix is the normalization of the whole input.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn split_normalized_utf16<'a>(&self, text: &'a [u16]) -> (&'a [u16], &'a [u16]) {
            let up_to = self.is_normalized_utf16_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                (&[], text)
            })
        }

        /// Return the index a slice of potentially-invalid UTF-16 is normalized up to.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
            let mut sink = IsNormalizedSinkUtf16::new(text);
            let _ = self.normalize_utf16_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Checks whether a slice of potentially-invalid UTF-16 is normalized.
        ///
        /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
            self.is_normalized_utf16_up_to(text) == text.len()
        }

        /// Normalize a slice of potentially-invalid UTF-8 into a `Cow<'a, str>`.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized_utf8(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = String::new();
            ret.reserve(text.len());
            ret.push_str(head);
            let _ = self.normalize_utf8_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a slice of potentially-invalid UTF-8 into maximum normalized (and valid)
        /// prefix and unnormalized suffix such that the concatenation of the prefix and the
        /// normalization of the suffix is the normalization of the whole input.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn split_normalized_utf8<'a>(&self, text: &'a [u8]) -> (&'a str, &'a [u8]) {
            let up_to = self.is_normalized_utf8_up_to(text);
            let (head, tail) = text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                (&[], text)
            });
            // SAFETY: The normalization check also checks for
            // UTF-8 well-formedness.
            (unsafe { core::str::from_utf8_unchecked(head) }, tail)
        }

        /// Return the index a slice of potentially-invalid UTF-8 is normalized up to
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
            let mut sink = IsNormalizedSinkUtf8::new(text);
            let _ = self.normalize_utf8_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Check if a slice of potentially-invalid UTF-8 is normalized.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard before checking.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
            self.is_normalized_utf8_up_to(text) == text.len()
        }
    };
}
1659
/// Borrowed version of a normalizer for performing decomposing normalization.
#[derive(Debug)]
pub struct DecomposingNormalizerBorrowed<'a> {
    /// Main per-scalar decomposition data (NFD, NFKD, or UTS 46 variant).
    decompositions: &'a DecompositionData<'a>,
    /// NFD decomposition expansion tables.
    tables: &'a DecompositionTables<'a>,
    /// Additional NFKD expansion tables; `None` for plain NFD
    /// (see the `new_nfd`/`new_nfkd` constructors).
    supplementary_tables: Option<&'a DecompositionTables<'a>>,
    decomposition_passthrough_bound: u8, // never above 0xC0
    composition_passthrough_bound: u16, // never above 0x0300
}
1669
impl DecomposingNormalizerBorrowed<'static> {
    /// Cheaply converts a [`DecomposingNormalizerBorrowed<'static>`] into a [`DecomposingNormalizer`].
    ///
    /// Note: Due to branching and indirection, using [`DecomposingNormalizer`] might inhibit some
    /// compile-time optimizations that are possible with [`DecomposingNormalizerBorrowed`].
    pub const fn static_to_owned(self) -> DecomposingNormalizer {
        DecomposingNormalizer {
            decompositions: DataPayload::from_static_ref(self.decompositions),
            tables: DataPayload::from_static_ref(self.tables),
            supplementary_tables: if let Some(s) = self.supplementary_tables {
                // `map` not available in const context
                Some(DataPayload::from_static_ref(s))
            } else {
                None
            },
            decomposition_passthrough_bound: self.decomposition_passthrough_bound,
            composition_passthrough_bound: self.composition_passthrough_bound,
        }
    }

    /// NFD constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfd() -> Self {
        // Compile-time guard: the combined table length must fit in the
        // 12-bit offset space used by the trie-value format.
        const _: () = assert!(
            provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
                .scalars16
                .const_len()
                + provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
                    .scalars24
                    .const_len()
                <= 0xFFF,
            "future extension"
        );

        DecomposingNormalizerBorrowed {
            decompositions: provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
            tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
            supplementary_tables: None,
            decomposition_passthrough_bound: 0xC0,
            composition_passthrough_bound: 0x0300,
        }
    }

    /// NFKD constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfkd() -> Self {
        // Compile-time guard: the combined NFD + NFKD table length must fit
        // in the 12-bit offset space used by the trie-value format.
        const _: () = assert!(
            provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
                .scalars16
                .const_len()
                + provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
                    .scalars24
                    .const_len()
                + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
                    .scalars16
                    .const_len()
                + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
                    .scalars24
                    .const_len()
                <= 0xFFF,
            "future extension"
        );

        const _: () = assert!(
            provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap <= 0x0300,
            "invalid"
        );

        // Cap the data-driven passthrough bound to the struct's documented
        // maxima (0xC0 for decomposition, 0x0300 for composition);
        // `min` is not available in const context.
        let decomposition_capped =
            if provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0xC0 {
                provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
            } else {
                0xC0
            };
        let composition_capped =
            if provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0x0300 {
                provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
            } else {
                0x0300
            };

        DecomposingNormalizerBorrowed {
            decompositions: provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1,
            tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
            supplementary_tables: Some(provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
            decomposition_passthrough_bound: decomposition_capped as u8,
            composition_passthrough_bound: composition_capped,
        }
    }

    /// UTS 46 decomposed constructor using compiled data; building block
    /// for the public UTS 46 normalization machinery.
    #[cfg(feature = "compiled_data")]
    pub(crate) const fn new_uts46_decomposed() -> Self {
        // Same table-length guard as `new_nfkd`: UTS 46 reuses the NFD and
        // NFKD expansion tables with its own per-scalar data.
        const _: () = assert!(
            provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
                .scalars16
                .const_len()
                + provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
                    .scalars24
                    .const_len()
                + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
                    .scalars16
                    .const_len()
                + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
                    .scalars24
                    .const_len()
                <= 0xFFF,
            "future extension"
        );

        const _: () = assert!(
            provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap <= 0x0300,
            "invalid"
        );

        // Cap the data-driven passthrough bound to the struct's documented
        // maxima, as in `new_nfkd`.
        let decomposition_capped =
            if provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0xC0 {
                provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
            } else {
                0xC0
            };
        let composition_capped =
            if provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0x0300 {
                provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
            } else {
                0x0300
            };

        DecomposingNormalizerBorrowed {
            decompositions: provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1,
            tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
            supplementary_tables: Some(provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
            decomposition_passthrough_bound: decomposition_capped as u8,
            composition_passthrough_bound: composition_capped,
        }
    }
}
1814
// Lifetime-generic methods: construction from already-loaded data plus the
// sink-writing normalization fast paths for `&str`, UTF-8 bytes, and UTF-16.
impl<'data> DecomposingNormalizerBorrowed<'data> {
    /// NFD constructor using already-loaded data.
    ///
    /// This constructor is intended for use by collations.
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[doc(hidden)]
    pub fn new_with_data(
        decompositions: &'data DecompositionData<'data>,
        tables: &'data DecompositionTables<'data>,
    ) -> Self {
        Self {
            decompositions,
            tables,
            // Plain NFD needs no supplementary (compatibility) tables.
            supplementary_tables: None,
            // Maximum bounds used elsewhere in this file for NFD:
            // decomposition passthrough below U+00C0, composition below U+0300.
            decomposition_passthrough_bound: 0xC0,
            composition_passthrough_bound: 0x0300,
        }
    }

    /// Wraps a delegate iterator into a decomposing iterator
    /// adapter by using the data already held by this normalizer.
    pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<'data, I> {
        Decomposition::new_with_supplements(
            iter,
            self.decompositions,
            self.tables,
            self.supplementary_tables,
            self.decomposition_passthrough_bound,
            IgnorableBehavior::Unsupported,
        )
    }

    // Shared normalization convenience methods; the macro is defined elsewhere
    // in this file.
    normalizer_methods!();

    decomposing_normalize_to!(
        /// Normalize a string slice into a `Write` sink.
        ,
        normalize_to,
        core::fmt::Write,
        &str,
        {
        },
        as_str,
        {
            // 0xC3 is the UTF-8 lead byte of U+00C0, so when the scalar bound
            // is 0xC0, bytes below 0xC3 can be passed through without a trie
            // lookup (continuation bytes only occur after an accepted lead).
            let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
                0xC3u8
            } else {
                decomposition_passthrough_bound.min(0x80) as u8
            };
            // The attribute belongs on an inner statement, but Rust doesn't allow it there.
            #[expect(clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < decomposition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            continue 'fastest;
                        }
                        // This deliberately isn't panic-free, since the code pattern
                        // that was OK for the composing counterpart regressed
                        // English and French performance if done here, too.
                        decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
                        break 'fastest;
                    }
                    // End of stream
                    sink.write_str(pending_slice)?;
                    return Ok(());
                }

                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                let upcoming = decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.starter_and_decomposes_to_self() {
                    continue 'fast;
                }
                // Flush everything confirmed normalized before the current character.
                let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                    - decomposition.delegate.as_str().len()
                    - upcoming.len_utf8()];
                sink.write_str(consumed_so_far_slice)?;

                // Now let's figure out if we got a starter or a non-starter.
                if decomposition_starts_with_non_starter(
                    upcoming_with_trie_value.trie_val,
                ) {
                    // Let this trie value be reprocessed in case it is
                    // one of the rare decomposing ones.
                    decomposition.pending = Some(upcoming_with_trie_value);
                    decomposition.gather_and_sort_combining(0);
                    continue 'outer;
                }
                undecomposed_starter = upcoming_with_trie_value;
                debug_assert!(decomposition.pending.is_none());
                break 'fast;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );

    decomposing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        ,
        normalize_utf8_to,
        core::fmt::Write,
        &[u8],
        {
        },
        as_slice,
        {
            let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8;
            'fast: loop {
                let mut code_unit_iter = decomposition.delegate.as_slice().iter();
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < decomposition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            continue 'fastest;
                        }
                        break 'fastest;
                    }
                    // End of stream
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
                    return Ok(());
                }
                #[expect(clippy::indexing_slicing)]
                {decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();}

                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                #[expect(clippy::unwrap_used)]
                let upcoming = decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.starter_and_decomposes_to_self_except_replacement() {
                    // Note: The trie value of the REPLACEMENT CHARACTER is
                    // intentionally formatted to fail the
                    // `starter_and_decomposes_to_self` test even though it
                    // really is a starter that decomposes to self. This
                    // allows moving the branch on REPLACEMENT CHARACTER
                    // below this `continue`.
                    continue 'fast;
                }

                // TODO: Annotate as unlikely.
                if upcoming == REPLACEMENT_CHARACTER {
                    // We might have an error, so fall out of the fast path.

                    // Since the U+FFFD might signify an error, we can't
                    // assume `upcoming.len_utf8()` for the backoff length.
                    #[expect(clippy::indexing_slicing)]
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
                    let back = consumed_so_far.next_back();
                    debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
                    let consumed_so_far_slice = consumed_so_far.as_slice();
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;

                    // We could call `gather_and_sort_combining` here and
                    // `continue 'outer`, but this should be better for code
                    // size.
                    undecomposed_starter = upcoming_with_trie_value;
                    debug_assert!(decomposition.pending.is_none());
                    break 'fast;
                }

                #[expect(clippy::indexing_slicing)]
                let consumed_so_far_slice = &pending_slice[..pending_slice.len()
                    - decomposition.delegate.as_slice().len()
                    - upcoming.len_utf8()];
                sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;

                // Now let's figure out if we got a starter or a non-starter.
                if decomposition_starts_with_non_starter(
                    upcoming_with_trie_value.trie_val,
                ) {
                    // Let this trie value be reprocessed in case it is
                    // one of the rare decomposing ones.
                    decomposition.pending = Some(upcoming_with_trie_value);
                    decomposition.gather_and_sort_combining(0);
                    continue 'outer;
                }
                undecomposed_starter = upcoming_with_trie_value;
                debug_assert!(decomposition.pending.is_none());
                break 'fast;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );

    decomposing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        ,
        normalize_utf16_to,
        write16::Write16,
        &[u16],
        {
            sink.size_hint(text.len())?;
        },
        as_slice,
        {
            // This loop is only broken out of as goto forward and only as release-build recovery from
            // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
            #[expect(clippy::never_loop)]
            'fastwrap: loop {
                // Commented out `code_unit_iter` and used `ptr` and `end` to
                // work around https://github.com/rust-lang/rust/issues/144684 .
                //
                // let mut code_unit_iter = decomposition.delegate.as_slice().iter();
                let delegate_as_slice = decomposition.delegate.as_slice();
                let mut ptr: *const u16 = delegate_as_slice.as_ptr();
                // SAFETY: materializing a pointer immediately past the end of an
                // allocation is OK.
                let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
                'fast: loop {
                    // if let Some(&upcoming_code_unit) = code_unit_iter.next() {
                    if ptr != end {
                        // SAFETY: We just checked that `ptr` has not reached `end`.
                        // `ptr` always advances by one, and we always have a check
                        // per advancement.
                        let upcoming_code_unit = unsafe { *ptr };
                        // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
                        // by one points to the same allocation or to immediately
                        // after, which is OK.
                        ptr = unsafe { ptr.add(1) };

                        let mut upcoming32 = u32::from(upcoming_code_unit);
                        // The performance of what logically is supposed to be this
                        // branch is _incredibly_ brittle and what LLVM ends up doing
                        // that affects the performance of what's logically about this
                        // decision can swing to double/halve the throughput for Basic
                        // Latin in ways that are completely unintuitive. Basically _any_
                        // change to _any_ code that participates in how LLVM sees the
                        // code around here can make the perf fall over. In seems that
                        // manually annotating this branch as likely has worse effects
                        // on non-Basic-Latin input than the case where LLVM just happens to
                        // do the right thing.
                        //
                        // What happens with this branch may depend on what sink type
                        // this code is monomorphized over.
                        //
                        // What a terrible sink of developer time!
                        if upcoming32 < decomposition_passthrough_bound {
                            continue 'fast;
                        }
                        // We might be doing a trie lookup by surrogate. Surrogates get
                        // a decomposition to U+FFFD.
                        let mut trie_value = decomposition.trie.get16(upcoming_code_unit);
                        if starter_and_decomposes_to_self_impl(trie_value) {
                            continue 'fast;
                        }
                        // We might now be looking at a surrogate.
                        // The loop is only broken out of as goto forward
                        #[expect(clippy::never_loop)]
                        'surrogateloop: loop {
                            // LLVM's optimizations are incredibly brittle for the code _above_,
                            // and using `likely` _below_ without using it _above_ helps!
                            // What a massive sink of developer time!
                            // Seriously, the effect of these annotations is massively
                            // unintuitive. Measure everything!
                            // Notably, the `if likely(...)` formulation optimizes differently
                            // than just putting `cold_path()` on the `else` path!
                            let surrogate_base = upcoming32.wrapping_sub(0xD800);
                            if likely(surrogate_base > (0xDFFF - 0xD800)) {
                                // Not surrogate
                                break 'surrogateloop;
                            }
                            if likely(surrogate_base <= (0xDBFF - 0xD800)) {
                                // let iter_backup = code_unit_iter.clone();
                                // if let Some(&low) = code_unit_iter.next() {
                                if ptr != end {
                                    // SAFETY: We just checked that `ptr` has not reached `end`.
                                    // `ptr` always advances by one, and we always have a check
                                    // per advancement.
                                    let low = unsafe { *ptr };
                                    if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
                                        // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
                                        // by one points to the same allocation or to immediately
                                        // after, which is OK.
                                        ptr = unsafe { ptr.add(1) };

                                        upcoming32 = (upcoming32 << 10) + u32::from(low)
                                            - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                                        // Successfully-paired surrogate. Read from the trie again.
                                        trie_value = {
                                            // Semantically, this bit of conditional compilation makes no sense.
                                            // The purpose is to keep LLVM seeing the untyped trie case the way
                                            // it did before so as not to regress the performance of the untyped
                                            // case due to unintuitive optimizer effects. If you care about the
                                            // perf of the untyped trie case and have better ideas, please try
                                            // something better.
                                            #[cfg(not(icu4x_unstable_fast_trie_only))]
                                            {decomposition.trie.get32(upcoming32)}
                                            #[cfg(icu4x_unstable_fast_trie_only)]
                                            {decomposition.trie.get32_supplementary(upcoming32)}
                                        };
                                        if likely(starter_and_decomposes_to_self_impl(trie_value)) {
                                            continue 'fast;
                                        }
                                        break 'surrogateloop;
                                    // } else {
                                    //     code_unit_iter = iter_backup;
                                    }
                                }
                            }
                            // unpaired surrogate
                            upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
                            // trie_value already holds a decomposition to U+FFFD.
                            break 'surrogateloop;
                        }

                        // SAFETY: `upcoming32` is either a non-surrogate code point read
                        // from the input (surrogates were handled above) or 0xFFFD.
                        let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                        let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);


                        let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
                            // code_unit_iter.as_slice().len()
                            // SAFETY: `ptr` and `end` have been derived from the same allocation
                            // and `ptr` is never greater than `end`.
                            unsafe { end.offset_from(ptr) as usize }
                            - upcoming.len_utf16()) else {
                            // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
                            debug_assert!(false);
                            // Throw away the results of the fast path.
                            break 'fastwrap;
                        };
                        sink.write_slice(consumed_so_far_slice)?;

                        if decomposition_starts_with_non_starter(
                            upcoming_with_trie_value.trie_val,
                        ) {
                            // Sync with main iterator
                            // decomposition.delegate = code_unit_iter.as_slice().chars();
                            // SAFETY: `ptr` and `end` have been derived from the same allocation
                            // and `ptr` is never greater than `end`.
                            decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
                            // Let this trie value be reprocessed in case it is
                            // one of the rare decomposing ones.
                            decomposition.pending = Some(upcoming_with_trie_value);
                            decomposition.gather_and_sort_combining(0);
                            continue 'outer;
                        }
                        undecomposed_starter = upcoming_with_trie_value;
                        debug_assert!(decomposition.pending.is_none());
                        break 'fast;
                    }
                    // End of stream
                    sink.write_slice(pending_slice)?;
                    return Ok(());
                }
                // Sync the main iterator
                // decomposition.delegate = code_unit_iter.as_slice().chars();
                // SAFETY: `ptr` and `end` have been derived from the same allocation
                // and `ptr` is never greater than `end`.
                decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
                break 'fastwrap;
            }
        },
        text,
        sink,
        decomposition,
        decomposition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        'outer,
    );
}
2205
/// A normalizer for performing decomposing normalization.
#[derive(Debug)]
pub struct DecomposingNormalizer {
    // Owned (`DataPayload`) counterparts of the borrowed fields of
    // `DecomposingNormalizerBorrowed`; see `as_borrowed`.
    decompositions: DataPayload<NormalizerNfdDataV1>,
    tables: DataPayload<NormalizerNfdTablesV1>,
    // `Some` for NFKD and UTS 46 (see the `try_new_*` constructors);
    // `None` for plain NFD.
    supplementary_tables: Option<DataPayload<NormalizerNfkdTablesV1>>,
    decomposition_passthrough_bound: u8, // never above 0xC0
    composition_passthrough_bound: u16, // never above 0x0300
}
2215
impl DecomposingNormalizer {
    /// Constructs a borrowed version of this type for more efficient querying.
    pub fn as_borrowed(&self) -> DecomposingNormalizerBorrowed<'_> {
        DecomposingNormalizerBorrowed {
            decompositions: self.decompositions.get(),
            tables: self.tables.get(),
            supplementary_tables: self.supplementary_tables.as_ref().map(|s| s.get()),
            decomposition_passthrough_bound: self.decomposition_passthrough_bound,
            composition_passthrough_bound: self.composition_passthrough_bound,
        }
    }

    /// NFD constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfd() -> DecomposingNormalizerBorrowed<'static> {
        DecomposingNormalizerBorrowed::new_nfd()
    }

    icu_provider::gen_buffer_data_constructors!(
        () -> error: DataError,
        functions: [
            new_nfd: skip,
            try_new_nfd_with_buffer_provider,
            try_new_nfd_unstable,
            Self,
        ]
    );

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
    pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerNfdDataV1> + DataProvider<NormalizerNfdTablesV1> + ?Sized,
    {
        let decompositions: DataPayload<NormalizerNfdDataV1> =
            provider.load(Default::default())?.payload;
        let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;

        // Runtime analog of the compile-time table-size guard in the
        // `compiled_data` constructors.
        if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
            // The data is from a future where there exists a normalization flavor whose
            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
            // of space. If a good use case from such a decomposition flavor arises, we can
            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
            // since for now the masks are hard-coded, error out.
            return Err(
                DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
            );
        }

        let cap = decompositions.get().passthrough_cap;
        if cap > 0x0300 {
            return Err(DataError::custom("invalid").with_marker(NormalizerNfdDataV1::INFO));
        }
        // Cap the two passthrough bounds to their documented maxima.
        let decomposition_capped = cap.min(0xC0);
        let composition_capped = cap.min(0x0300);

        Ok(DecomposingNormalizer {
            decompositions,
            tables,
            supplementary_tables: None,
            decomposition_passthrough_bound: decomposition_capped as u8,
            composition_passthrough_bound: composition_capped,
        })
    }

    icu_provider::gen_buffer_data_constructors!(
        () -> error: DataError,
        functions: [
            new_nfkd: skip,
            try_new_nfkd_with_buffer_provider,
            try_new_nfkd_unstable,
            Self,
        ]
    );

    /// NFKD constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfkd() -> DecomposingNormalizerBorrowed<'static> {
        DecomposingNormalizerBorrowed::new_nfkd()
    }

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
    pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerNfkdDataV1>
            + DataProvider<NormalizerNfdTablesV1>
            + DataProvider<NormalizerNfkdTablesV1>
            + ?Sized,
    {
        let decompositions: DataPayload<NormalizerNfkdDataV1> =
            provider.load(Default::default())?.payload;
        let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
        let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
            provider.load(Default::default())?.payload;

        // Runtime analog of the compile-time table-size guard in the
        // `compiled_data` constructors; NFKD counts both table sets.
        if tables.get().scalars16.len()
            + tables.get().scalars24.len()
            + supplementary_tables.get().scalars16.len()
            + supplementary_tables.get().scalars24.len()
            > 0xFFF
        {
            // The data is from a future where there exists a normalization flavor whose
            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
            // of space. If a good use case from such a decomposition flavor arises, we can
            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
            // since for now the masks are hard-coded, error out.
            return Err(
                DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
            );
        }

        let cap = decompositions.get().passthrough_cap;
        if cap > 0x0300 {
            return Err(DataError::custom("invalid").with_marker(NormalizerNfkdDataV1::INFO));
        }
        // Cap the two passthrough bounds to their documented maxima.
        let decomposition_capped = cap.min(0xC0);
        let composition_capped = cap.min(0x0300);

        Ok(DecomposingNormalizer {
            decompositions: decompositions.cast(),
            tables,
            supplementary_tables: Some(supplementary_tables),
            decomposition_passthrough_bound: decomposition_capped as u8,
            composition_passthrough_bound: composition_capped,
        })
    }

    /// UTS 46 decomposed constructor (testing only)
    ///
    /// This is a special building block normalization for IDNA. It is the decomposed counterpart of
    /// ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows and
    /// ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as in
    /// NFD in this normalization. In both cases, the previous UTS 46 processing before using
    /// normalization is expected to deal with these characters. Making the disallowed characters
    /// behave like this is beneficial to data size, and this normalizer implementation cannot
    /// deal with a character normalizing to the empty string, which doesn't happen in NFD or
    /// NFKD as of Unicode 14.
    ///
    /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
    /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
    /// U+0345 from a reordered character into a non-reordered character before reordering happens.
    /// Therefore, the output of this normalization may differ for different inputs that are
    /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
    /// to other reorderable characters.
    pub(crate) fn try_new_uts46_decomposed_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerUts46DataV1>
            + DataProvider<NormalizerNfdTablesV1>
            + DataProvider<NormalizerNfkdTablesV1>
            // UTS 46 tables merged into CompatibilityDecompositionTablesV1
            + ?Sized,
    {
        let decompositions: DataPayload<NormalizerUts46DataV1> =
            provider.load(Default::default())?.payload;
        let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
        let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
            provider.load(Default::default())?.payload;

        // Same table-size guard as the NFKD constructor above.
        if tables.get().scalars16.len()
            + tables.get().scalars24.len()
            + supplementary_tables.get().scalars16.len()
            + supplementary_tables.get().scalars24.len()
            > 0xFFF
        {
            // The data is from a future where there exists a normalization flavor whose
            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
            // of space. If a good use case from such a decomposition flavor arises, we can
            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
            // since for now the masks are hard-coded, error out.
            return Err(
                DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
            );
        }

        let cap = decompositions.get().passthrough_cap;
        if cap > 0x0300 {
            return Err(DataError::custom("invalid").with_marker(NormalizerUts46DataV1::INFO));
        }
        // Cap the two passthrough bounds to their documented maxima.
        let decomposition_capped = cap.min(0xC0);
        let composition_capped = cap.min(0x0300);

        Ok(DecomposingNormalizer {
            decompositions: decompositions.cast(),
            tables,
            supplementary_tables: Some(supplementary_tables),
            decomposition_passthrough_bound: decomposition_capped as u8,
            composition_passthrough_bound: composition_capped,
        })
    }
}
2416
/// Borrowed version of a normalizer for performing composing normalization.
#[derive(Debug)]
pub struct ComposingNormalizerBorrowed<'a> {
    // Composition is implemented on top of decomposition (see
    // `normalize_iter_private`), so a decomposing normalizer is embedded here.
    decomposing_normalizer: DecomposingNormalizerBorrowed<'a>,
    canonical_compositions: &'a CanonicalCompositions<'a>,
}
2423
impl ComposingNormalizerBorrowed<'static> {
    /// Cheaply converts a [`ComposingNormalizerBorrowed<'static>`] into a [`ComposingNormalizer`].
    ///
    /// Note: Due to branching and indirection, using [`ComposingNormalizer`] might inhibit some
    /// compile-time optimizations that are possible with [`ComposingNormalizerBorrowed`].
    pub const fn static_to_owned(self) -> ComposingNormalizer {
        ComposingNormalizer {
            decomposing_normalizer: self.decomposing_normalizer.static_to_owned(),
            canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions),
        }
    }

    /// NFC constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfc() -> Self {
        // NFC = NFD decomposition + canonical composition.
        ComposingNormalizerBorrowed {
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfd(),
            canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }

    /// NFKC constructor using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new_nfkc() -> Self {
        // NFKC = NFKD decomposition + the same canonical composition data as NFC.
        ComposingNormalizerBorrowed {
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfkd(),
            canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }

    /// This is a special building block normalization for IDNA that implements parts of the Map
    /// step and the following Normalize step.
    ///
    /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
    /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
    /// U+0345 from a reordered character into a non-reordered character before reordering happens.
    /// Therefore, the output of this normalization may differ for different inputs that are
    /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
    /// to other reorderable characters.
    #[cfg(feature = "compiled_data")]
    pub(crate) const fn new_uts46() -> Self {
        ComposingNormalizerBorrowed {
            decomposing_normalizer: DecomposingNormalizerBorrowed::new_uts46_decomposed(),
            canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }
}
2479
2480impl<'data> ComposingNormalizerBorrowed<'data> {
2481 /// Wraps a delegate iterator into a composing iterator
2482 /// adapter by using the data already held by this normalizer.
2483 pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<'data, I> {
2484 self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
2485 }
2486
2487 fn normalize_iter_private<I: Iterator<Item = char>>(
2488 &self,
2489 iter: I,
2490 ignorable_behavior: IgnorableBehavior,
2491 ) -> Composition<'data, I> {
2492 Composition::new(
2493 Decomposition::new_with_supplements(
2494 iter,
2495 self.decomposing_normalizer.decompositions,
2496 self.decomposing_normalizer.tables,
2497 self.decomposing_normalizer.supplementary_tables,
2498 self.decomposing_normalizer.decomposition_passthrough_bound,
2499 ignorable_behavior,
2500 ),
2501 self.canonical_compositions.canonical_compositions.clone(),
2502 self.decomposing_normalizer.composition_passthrough_bound,
2503 )
2504 }
2505
    // Shared normalization convenience methods; the macro is defined elsewhere
    // in this file.
    normalizer_methods!();
2507
    composing_normalize_to!(
        /// Normalize a string slice into a `Write` sink.
        ,
        normalize_to,
        core::fmt::Write,
        &str,
        {},
        true,
        as_str,
        {
            // Let's hope LICM hoists this outside `'outer`.
            // 0xCC is the UTF-8 lead byte of U+0300, so when the scalar bound
            // is 0x300, bytes below 0xCC can be passed through without a trie
            // lookup.
            let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
                0xCCu8
            } else {
                // We can make this fancy if a normalization other than NFC where looking at
                // non-ASCII lead bytes is worthwhile is ever introduced.
                composition_passthrough_bound.min(0x80) as u8
            };
            // Attributes have to be on blocks, so hoisting all the way here.
            #[expect(clippy::unwrap_used)]
            'fast: loop {
                let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
                'fastest: loop {
                    if let Some(&upcoming_byte) = code_unit_iter.next() {
                        if upcoming_byte < composition_passthrough_byte_bound {
                            // Fast-track succeeded!
                            continue 'fastest;
                        }
                        let Some(remaining_slice) = pending_slice.get(pending_slice.len() - code_unit_iter.as_slice().len() - 1..) else {
                            // If we ever come here, it's an internal bug. Let's avoid panic code paths in release builds.
                            debug_assert!(false);
                            // Throw away the fastest-path result in case of an internal bug.
                            break 'fastest;
                        };
                        composition.decomposition.delegate = remaining_slice.chars();
                        break 'fastest;
                    }
                    // End of stream
                    sink.write_str(pending_slice)?;
                    return Ok(());
                }
                // `unwrap()` OK, because the slice is valid UTF-8 and we know there
                // is an upcoming byte.
                let upcoming = composition.decomposition.delegate.next().unwrap();
                let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
                    // Can't combine backwards, hence a plain (non-backwards-combining)
                    // starter albeit past `composition_passthrough_bound`

                    // Fast-track succeeded!
                    continue 'fast;
                }
                // We need to fall off the fast path.
                composition.decomposition.pending = Some(upcoming_with_trie_value);

                // slicing and unwrap OK, because we've just evidently read enough previously.
                let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
                // `unwrap` OK, because we've previously managed to read the previous character
                undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                let consumed_so_far_slice = consumed_so_far.as_str();
                sink.write_str(consumed_so_far_slice)?;
                break 'fast;
            }
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf8,
    );
2580
    composing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        ,
        normalize_utf8_to,
        core::fmt::Write,
        &[u8],
        {},
        false,
        as_slice,
        {
            // Fast path: keep passing through input characters for as long as each
            // one is known not to participate in composition with its neighbors.
            // Nothing is written while fast-tracking; the passed-through prefix of
            // `pending_slice` is flushed as one contiguous write when the fast
            // path ends.
            'fast: loop {
                if let Some(upcoming) = composition.decomposition.delegate.next() {
                    if u32::from(upcoming) < composition_passthrough_bound {
                        // Fast-track succeeded!
                        continue 'fast;
                    }
                    // TODO: Be statically aware of fast/small trie.
                    let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
                    if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
                        // Note: The trie value of the REPLACEMENT CHARACTER is
                        // intentionally formatted to fail the
                        // `potential_passthrough_and_cannot_combine_backwards`
                        // test even though it really is a starter that decomposes
                        // to self and cannot combine backwards. This
                        // allows moving the branch on REPLACEMENT CHARACTER
                        // below this `continue`.
                        continue 'fast;
                    }
                    // We need to fall off the fast path.

                    // TODO(#2006): Annotate as unlikely
                    if upcoming == REPLACEMENT_CHARACTER {
                        // Can't tell if this is an error or a literal U+FFFD in
                        // the input. Assuming the former to be sure.

                        // Since the U+FFFD might signify an error, we can't
                        // assume `upcoming.len_utf8()` for the backoff length.
                        #[expect(clippy::indexing_slicing)]
                        let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
                        let back = consumed_so_far.next_back();
                        debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
                        let consumed_so_far_slice = consumed_so_far.as_slice();
                        // SAFETY: every character in `consumed_so_far_slice` was
                        // passed through by the fast track above (U+FFFD, which an
                        // ill-formed sequence would have decoded to, never passes
                        // the fast-track tests), so the slice is valid UTF-8.
                        sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) })?;
                        undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
                        composition.decomposition.pending = None;
                        break 'fast;
                    }

                    composition.decomposition.pending = Some(upcoming_with_trie_value);
                    // slicing and unwrap OK, because we've just evidently read enough previously.
                    // `unwrap` OK, because we've previously managed to read the previous character
                    #[expect(clippy::indexing_slicing)]
                    let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
                    #[expect(clippy::unwrap_used)]
                    {
                        // TODO: If the previous character was below the passthrough bound,
                        // we really need to read from the trie. Otherwise, we could maintain
                        // the most-recent trie value. Need to measure what's more expensive:
                        // Remembering the trie value on each iteration or re-reading the
                        // last one after the fast-track run.
                        undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
                    }
                    let consumed_so_far_slice = consumed_so_far.as_slice();
                    // SAFETY: as above, the fast track only passed through characters
                    // that decoded normally, so this prefix is valid UTF-8.
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice)})?;
                    break 'fast;
                }
                // End of stream
                // SAFETY: the whole remaining `pending_slice` was traversed by the
                // fast track as passthrough characters, so it is valid UTF-8.
                sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
                return Ok(());
            }
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf8,
    );
2666
    composing_normalize_to!(
        /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        ,
        normalize_utf16_to,
        write16::Write16,
        &[u16],
        {
            sink.size_hint(text.len())?;
        },
        false,
        as_slice,
        {
            // This loop is only broken out of as goto forward and only as release-build recovery from
            // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
            #[expect(clippy::never_loop)]
            'fastwrap: loop {
                // Commented out `code_unit_iter` and used `ptr` and `end` to
                // work around https://github.com/rust-lang/rust/issues/144684 .
                //
                // let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
                let delegate_as_slice = composition.decomposition.delegate.as_slice();
                let mut ptr: *const u16 = delegate_as_slice.as_ptr();
                // SAFETY: materializing a pointer immediately past the end of an
                // allocation is OK.
                let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };

                // Fast path: pass through code units that cannot participate in
                // composition, pairing surrogates manually along the way.
                'fast: loop {
                    // if let Some(&upcoming_code_unit) = code_unit_iter.next() {
                    if ptr != end {
                        // SAFETY: We just checked that `ptr` has not reached `end`.
                        // `ptr` always advances by one, and we always have a check
                        // per advancement.
                        let upcoming_code_unit = unsafe { *ptr };
                        // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
                        // by one points to the same allocation or to immediately
                        // after, which is OK.
                        ptr = unsafe { ptr.add(1) };

                        let mut upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
                        // The performance of what logically is supposed to be this
                        // branch is somewhat brittle and what LLVM ends up doing
                        // that affects the performance of what's logically about this
                        // decision can swing to double/halve the throughput for Basic
                        // Latin in ways that are completely unintuitive. Basically _any_
                        // change to _any_ code that participates in how LLVM sees the
                        // code around here can make the perf fall over. It seems that
                        // manually annotating this branch as likely has worse effects
                        // on non-Basic-Latin input than the case where LLVM just happens to
                        // do the right thing.
                        //
                        // What happens with this branch may depend on what sink type
                        // this code is monomorphized over.
                        //
                        // What a terrible sink of developer time!
                        if upcoming32 < composition_passthrough_bound {
                            // No need for surrogate or U+FFFD check, because
                            // `composition_passthrough_bound` cannot be higher than
                            // U+0300.
                            // Fast-track succeeded!
                            continue 'fast;
                        }
                        // We might be doing a trie lookup by surrogate. Surrogates get
                        // a decomposition to U+FFFD.
                        let mut trie_value = composition.decomposition.trie.get16(upcoming_code_unit);
                        if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) {
                            // Can't combine backwards, hence a plain (non-backwards-combining)
                            // starter albeit past `composition_passthrough_bound`

                            // Fast-track succeeded!
                            continue 'fast;
                        }

                        // We might now be looking at a surrogate.
                        // The loop is only broken out of as goto forward
                        #[expect(clippy::never_loop)]
                        'surrogateloop: loop {
                            // The `likely` annotations _below_ exist to make the code _above_
                            // go faster!
                            let surrogate_base = upcoming32.wrapping_sub(0xD800);
                            if likely(surrogate_base > (0xDFFF - 0xD800)) {
                                // Not surrogate
                                break 'surrogateloop;
                            }
                            if likely(surrogate_base <= (0xDBFF - 0xD800)) {
                                // High (leading) surrogate: try to pair it with the
                                // next code unit without committing the advancement.
                                // let iter_backup = code_unit_iter.clone();
                                // if let Some(&low) = code_unit_iter.next() {
                                if ptr != end {
                                    // SAFETY: We just checked that `ptr` has not reached `end`.
                                    // `ptr` always advances by one, and we always have a check
                                    // per advancement.
                                    let low = unsafe { *ptr };
                                    if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
                                        // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
                                        // by one points to the same allocation or to immediately
                                        // after, which is OK.
                                        ptr = unsafe { ptr.add(1) };

                                        upcoming32 = (upcoming32 << 10) + u32::from(low)
                                            - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                                        // Successfully-paired surrogate. Read from the trie again.
                                        trie_value = {
                                            // Semantically, this bit of conditional compilation makes no sense.
                                            // The purpose is to keep LLVM seeing the untyped trie case the way
                                            // it did before so as not to regress the performance of the untyped
                                            // case due to unintuitive optimizer effects. If you care about the
                                            // perf of the untyped trie case and have better ideas, please try
                                            // something better.
                                            #[cfg(not(icu4x_unstable_fast_trie_only))]
                                            {composition.decomposition.trie.get32(upcoming32)}
                                            #[cfg(icu4x_unstable_fast_trie_only)]
                                            {composition.decomposition.trie.get32_supplementary(upcoming32)}
                                        };
                                        if likely(potential_passthrough_and_cannot_combine_backwards_impl(trie_value)) {
                                            // Fast-track succeeded!
                                            continue 'fast;
                                        }
                                        break 'surrogateloop;
                                    // } else {
                                    //     code_unit_iter = iter_backup;
                                    }
                                }
                            }
                            // unpaired surrogate
                            upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
                            // trie_value already holds a decomposition to U+FFFD.
                            debug_assert_eq!(trie_value, NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER | 0xFFFD);
                            break 'surrogateloop;
                        }

                        // SAFETY: upcoming32 can no longer be a surrogate.
                        let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                        let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
                        // We need to fall off the fast path.
                        composition.decomposition.pending = Some(upcoming_with_trie_value);
                        let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
                            // code_unit_iter.as_slice().len()
                            // SAFETY: `ptr` and `end` have been derived from the same allocation
                            // and `ptr` is never greater than `end`.
                            unsafe { end.offset_from(ptr) as usize }
                            - upcoming.len_utf16()) else {
                            // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
                            debug_assert!(false);
                            // Throw away the results of the fast path.
                            break 'fastwrap;
                        };
                        let mut consumed_so_far = consumed_so_far_slice.chars();
                        let Some(c_from_back) = consumed_so_far.next_back() else {
                            // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
                            debug_assert!(false);
                            // Throw away the results of the fast path.
                            break 'fastwrap;
                        };
                        // TODO: If the previous character was below the passthrough bound,
                        // we really need to read from the trie. Otherwise, we could maintain
                        // the most-recent trie value. Need to measure what's more expensive:
                        // Remembering the trie value on each iteration or re-reading the
                        // last one after the fast-track run.
                        undecomposed_starter = composition.decomposition.attach_trie_value(c_from_back);
                        sink.write_slice(consumed_so_far.as_slice())?;
                        break 'fast;
                    }
                    // End of stream
                    sink.write_slice(pending_slice)?;
                    return Ok(());
                }
                // Sync the main iterator
                // composition.decomposition.delegate = code_unit_iter.as_slice().chars();
                // SAFETY: `ptr` and `end` have been derived from the same allocation
                // and `ptr` is never greater than `end`.
                composition.decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
                break 'fastwrap;
            }
        },
        text,
        sink,
        composition,
        composition_passthrough_bound,
        undecomposed_starter,
        pending_slice,
        len_utf16,
    );
2854}
2855
/// A normalizer for performing composing normalization.
#[derive(Debug)]
pub struct ComposingNormalizer {
    // The decomposing half of the pipeline (constructed as NFD, NFKD, or
    // UTS 46 decomposed by the respective constructor below); owns the
    // decomposition data payloads.
    decomposing_normalizer: DecomposingNormalizer,
    // Payload for the canonical composition step applied after decomposition.
    canonical_compositions: DataPayload<NormalizerNfcV1>,
}
2862
2863impl ComposingNormalizer {
2864 /// Constructs a borrowed version of this type for more efficient querying.
2865 pub fn as_borrowed(&self) -> ComposingNormalizerBorrowed<'_> {
2866 ComposingNormalizerBorrowed {
2867 decomposing_normalizer: self.decomposing_normalizer.as_borrowed(),
2868 canonical_compositions: self.canonical_compositions.get(),
2869 }
2870 }
2871
2872 /// NFC constructor using compiled data.
2873 ///
2874 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2875 ///
2876 /// [📚 Help choosing a constructor](icu_provider::constructors)
2877 #[cfg(feature = "compiled_data")]
2878 pub const fn new_nfc() -> ComposingNormalizerBorrowed<'static> {
2879 ComposingNormalizerBorrowed::new_nfc()
2880 }
2881
2882 icu_provider::gen_buffer_data_constructors!(
2883 () -> error: DataError,
2884 functions: [
2885 new_nfc: skip,
2886 try_new_nfc_with_buffer_provider,
2887 try_new_nfc_unstable,
2888 Self,
2889 ]
2890 );
2891
2892 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
2893 pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, DataError>
2894 where
2895 D: DataProvider<NormalizerNfdDataV1>
2896 + DataProvider<NormalizerNfdTablesV1>
2897 + DataProvider<NormalizerNfcV1>
2898 + ?Sized,
2899 {
2900 let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?;
2901
2902 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2903 provider.load(Default::default())?.payload;
2904
2905 Ok(ComposingNormalizer {
2906 decomposing_normalizer,
2907 canonical_compositions,
2908 })
2909 }
2910
2911 /// NFKC constructor using compiled data.
2912 ///
2913 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2914 ///
2915 /// [📚 Help choosing a constructor](icu_provider::constructors)
2916 #[cfg(feature = "compiled_data")]
2917 pub const fn new_nfkc() -> ComposingNormalizerBorrowed<'static> {
2918 ComposingNormalizerBorrowed::new_nfkc()
2919 }
2920
2921 icu_provider::gen_buffer_data_constructors!(
2922 () -> error: DataError,
2923 functions: [
2924 new_nfkc: skip,
2925 try_new_nfkc_with_buffer_provider,
2926 try_new_nfkc_unstable,
2927 Self,
2928 ]
2929 );
2930
2931 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
2932 pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, DataError>
2933 where
2934 D: DataProvider<NormalizerNfkdDataV1>
2935 + DataProvider<NormalizerNfdTablesV1>
2936 + DataProvider<NormalizerNfkdTablesV1>
2937 + DataProvider<NormalizerNfcV1>
2938 + ?Sized,
2939 {
2940 let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?;
2941
2942 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2943 provider.load(Default::default())?.payload;
2944
2945 Ok(ComposingNormalizer {
2946 decomposing_normalizer,
2947 canonical_compositions,
2948 })
2949 }
2950
2951 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
2952 pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, DataError>
2953 where
2954 D: DataProvider<NormalizerUts46DataV1>
2955 + DataProvider<NormalizerNfdTablesV1>
2956 + DataProvider<NormalizerNfkdTablesV1>
2957 // UTS 46 tables merged into CompatibilityDecompositionTablesV1
2958 + DataProvider<NormalizerNfcV1>
2959 + ?Sized,
2960 {
2961 let decomposing_normalizer =
2962 DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
2963
2964 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2965 provider.load(Default::default())?.payload;
2966
2967 Ok(ComposingNormalizer {
2968 decomposing_normalizer,
2969 canonical_compositions,
2970 })
2971 }
2972}
2973
// Sink used to check whether UTF-16 input is already normalized: each write
// must reproduce exactly the next portion of the expected (input) slice.
#[cfg(feature = "utf16_iter")]
struct IsNormalizedSinkUtf16<'a> {
    expect: &'a [u16],
}

#[cfg(feature = "utf16_iter")]
impl<'a> IsNormalizedSinkUtf16<'a> {
    pub fn new(slice: &'a [u16]) -> Self {
        Self { expect: slice }
    }
    pub fn remaining_len(&self) -> usize {
        self.expect.len()
    }
}

#[cfg(feature = "utf16_iter")]
impl write16::Write16 for IsNormalizedSinkUtf16<'_> {
    fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
        // A slice write is always a pass-through of the original input, so
        // comparing addresses suffices. Indexing cannot fail here: a failure
        // would indicate a code bug rather than an input or data issue.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        #[expect(clippy::indexing_slicing)]
        {
            self.expect = &self.expect[s.len()..];
        }
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        let mut rest = self.expect.chars();
        match rest.next() {
            Some(first) if first == c => {
                self.expect = rest.as_slice();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}
3015
// Sink used to check whether UTF-8 input is already normalized: each write
// must reproduce exactly the next portion of the expected (input) slice.
#[cfg(feature = "utf8_iter")]
struct IsNormalizedSinkUtf8<'a> {
    expect: &'a [u8],
}

#[cfg(feature = "utf8_iter")]
impl<'a> IsNormalizedSinkUtf8<'a> {
    pub fn new(slice: &'a [u8]) -> Self {
        Self { expect: slice }
    }
    pub fn remaining_len(&self) -> usize {
        self.expect.len()
    }
}

#[cfg(feature = "utf8_iter")]
impl core::fmt::Write for IsNormalizedSinkUtf8<'_> {
    fn write_str(&mut self, s: &str) -> core::fmt::Result {
        // A slice write is always a pass-through of the original input, so
        // comparing addresses suffices. Indexing cannot fail here: a failure
        // would indicate a code bug rather than an input or data issue.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        #[expect(clippy::indexing_slicing)]
        {
            self.expect = &self.expect[s.len()..];
        }
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        let mut rest = self.expect.chars();
        match rest.next() {
            Some(first) if first == c => {
                self.expect = rest.as_slice();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}
3057
// Sink used to check whether `&str` input is already normalized: each write
// must reproduce exactly the next portion of the expected (input) string.
struct IsNormalizedSinkStr<'a> {
    expect: &'a str,
}

impl<'a> IsNormalizedSinkStr<'a> {
    pub fn new(slice: &'a str) -> Self {
        Self { expect: slice }
    }
    pub fn remaining_len(&self) -> usize {
        self.expect.len()
    }
}

impl core::fmt::Write for IsNormalizedSinkStr<'_> {
    fn write_str(&mut self, s: &str) -> core::fmt::Result {
        // A string-slice write is always a pass-through of the original input,
        // so comparing addresses suffices; a pass-through never exceeds the
        // remaining expectation, so the slicing below cannot fail absent a
        // code bug.
        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
            return Err(core::fmt::Error {});
        }
        self.expect = &self.expect[s.len()..];
        Ok(())
    }

    fn write_char(&mut self, c: char) -> core::fmt::Result {
        let mut rest = self.expect.chars();
        match rest.next() {
            Some(first) if first == c => {
                self.expect = rest.as_str();
                Ok(())
            }
            _ => Err(core::fmt::Error {}),
        }
    }
}