icu_normalizer/lib.rs
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
    not(test),
    deny(
        clippy::indexing_slicing,
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::panic,
        clippy::exhaustive_structs,
        clippy::exhaustive_enums,
        clippy::trivially_copy_pass_by_ref,
        missing_debug_implementations,
    )
)]
#![warn(missing_docs)]

//! Normalizing text into Unicode Normalization Forms.
//!
//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
//!
//! # Functionality
//!
//! The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode
//! Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD.
//!
//! Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8,
//! and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator.
//!
//! The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA
//! Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by
//! applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the
//! [`idna`](https://docs.rs/idna/latest/idna/) crate.
//!
//! The `properties` module provides the non-recursive canonical decomposition operation on a per-`char` basis and
//! the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class
//! property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the
//! [`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate.
//!
//! Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in
//! addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive
//! non-“maybe” answer.
//!
//! # Examples
//!
//! ```
//! let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
//! assert_eq!(nfc.normalize("a\u{0308}"), "ä");
//! assert!(nfc.is_normalized("ä"));
//!
//! let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
//! assert_eq!(nfd.normalize("ä"), "a\u{0308}");
//! assert!(!nfd.is_normalized("ä"));
//! ```
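//!
//! A normalizing iterator can wrap any iterator over `char`. A minimal
//! sketch (using the `normalize_iter` method defined later in this file):
//!
//! ```
//! let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
//! let normalized: String = nfc.normalize_iter("a\u{0308}".chars()).collect();
//! assert_eq!(normalized, "ä");
//! ```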

extern crate alloc;

// TODO: The plan is to replace
// `#[cfg(not(icu4x_unstable_fast_trie_only))]`
// with
// `#[cfg(feature = "serde")]`
// and
// `#[cfg(icu4x_unstable_fast_trie_only)]`
// with
// `#[cfg(not(feature = "serde"))]`
//
// Before doing so:
// * The type of the UTS 46 trie needs to be
//   disentangled from the type of the NFD/NFKD tries.
//   This will involve a more generic iterator hidden
//   inside the public iterator types.
// * datagen needs to emit fast-mode tries for the
//   NFD and NFKD tries.
// * The markers and possibly the data struct type
//   for NFD and NFKD need to be revised per policy.

#[cfg(not(icu4x_unstable_fast_trie_only))]
type Trie<'trie> = CodePointTrie<'trie, u32>;

#[cfg(icu4x_unstable_fast_trie_only)]
type Trie<'trie> = FastCodePointTrie<'trie, u32>;

// We don't depend on icu_properties to minimize deps, but we want to be able
// to ensure we're using the right CCC values.
macro_rules! ccc {
    ($name:ident, $num:expr) => {
        const {
            #[cfg(feature = "icu_properties")]
            if icu_properties::props::CanonicalCombiningClass::$name.to_icu4c_value() != $num {
                panic!("icu_normalizer has incorrect ccc values")
            }
            CanonicalCombiningClass::from_icu4c_value($num)
        }
    };
}
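
// Usage sketch: `ccc!(Above, 230)` evaluates to Canonical Combining Class
// 230 and, when the `icu_properties` feature is enabled, additionally
// verifies at compile time that 230 is the value icu_properties assigns to
// `CanonicalCombiningClass::Above`. See `CCC_ABOVE` below.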

pub mod properties;
pub mod provider;
pub mod uts46;

use crate::provider::CanonicalCompositions;
use crate::provider::DecompositionData;
use crate::provider::NormalizerNfdDataV1;
use crate::provider::NormalizerNfkdDataV1;
use crate::provider::NormalizerUts46DataV1;
use alloc::borrow::Cow;
use alloc::string::String;
use core::char::REPLACEMENT_CHARACTER;
use icu_collections::char16trie::Char16Trie;
use icu_collections::char16trie::Char16TrieIterator;
use icu_collections::char16trie::TrieResult;
#[cfg(not(icu4x_unstable_fast_trie_only))]
use icu_collections::codepointtrie::CodePointTrie;
#[cfg(icu4x_unstable_fast_trie_only)]
use icu_collections::codepointtrie::FastCodePointTrie;
#[cfg(icu4x_unstable_fast_trie_only)]
use icu_collections::codepointtrie::TypedCodePointTrie;
#[cfg(feature = "icu_properties")]
use icu_properties::props::CanonicalCombiningClass;
use icu_provider::prelude::*;
use provider::DecompositionTables;
use provider::NormalizerNfcV1;
use provider::NormalizerNfdTablesV1;
use provider::NormalizerNfkdTablesV1;
use smallvec::SmallVec;
#[cfg(feature = "utf16_iter")]
use utf16_iter::Utf16CharsEx;
#[cfg(feature = "utf8_iter")]
use utf8_iter::Utf8CharsEx;
use zerovec::{zeroslice, ZeroSlice};

// The optimizations in the area where `likely` is used
// are extremely brittle. `likely` is useful in the typed-trie
// case on the UTF-16 fast path, but in order not to disturb
// the untyped-trie case on the UTF-16 fast path, make the
// annotations no-ops in the untyped-trie case.

// `cold_path` and `likely` come from
// https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .
// See https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3#commitcomment-164768806
// for permission to relicense under Unicode-3.0.

#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
#[cold]
fn cold_path() {}

#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
pub(crate) fn likely(b: bool) -> bool {
    if b {
        true
    } else {
        cold_path();
        false
    }
}

// End import from https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .

/// No-op for the untyped-trie case.
#[cfg(all(not(icu4x_unstable_fast_trie_only), feature = "utf16_iter"))]
#[inline(always)]
fn likely(b: bool) -> bool {
    b
}

/// This type exists as a shim for the icu_properties `CanonicalCombiningClass`
/// when that crate is disabled. It should not be exposed to users.
#[cfg(not(feature = "icu_properties"))]
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
struct CanonicalCombiningClass(pub(crate) u8);

#[cfg(not(feature = "icu_properties"))]
impl CanonicalCombiningClass {
    const fn from_icu4c_value(v: u8) -> Self {
        Self(v)
    }
    const fn to_icu4c_value(self) -> u8 {
        self.0
    }
}

const CCC_NOT_REORDERED: CanonicalCombiningClass = ccc!(NotReordered, 0);
const CCC_ABOVE: CanonicalCombiningClass = ccc!(Above, 230);

/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
#[derive(Debug, PartialEq, Eq)]
enum IgnorableBehavior {
    /// 0xFFFFFFFF in data is not supported.
    Unsupported,
    /// Ignorables are ignored.
    Ignored,
    /// Ignorables are treated as singleton decompositions
    /// to the REPLACEMENT CHARACTER.
    ReplacementCharacter,
}

/// Marker for UTS 46 ignorables.
///
/// See trie-value-format.md
const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;

/// Marker that the decomposition does not round trip via NFC.
///
/// See trie-value-format.md
const NON_ROUND_TRIP_MARKER: u32 = 1 << 30;

/// Marker that the first character of the decomposition
/// can combine backwards.
///
/// See trie-value-format.md
const BACKWARD_COMBINING_MARKER: u32 = 1 << 31;

/// Mask for the bits that have to be zero for this to be a BMP
/// singleton decomposition, or a value baked into the surrogate
/// range.
///
/// See trie-value-format.md
const HIGH_ZEROS_MASK: u32 = 0x3FFF0000;

/// Mask for the bits that have to be zero for this to be a complex
/// decomposition.
///
/// See trie-value-format.md
const LOW_ZEROS_MASK: u32 = 0xFFE0;
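
// Sketch of the trie value bit layout implied by the constants above
// (trie-value-format.md is authoritative):
//
//   bit 31:        BACKWARD_COMBINING_MARKER
//   bit 30:        NON_ROUND_TRIP_MARKER
//   bits 29..=16:  must be zero for BMP singleton decompositions and
//                  for values baked into the surrogate range (HIGH_ZEROS_MASK)
//   bits 15..=5:   must be zero for complex decompositions (LOW_ZEROS_MASK)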

/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
///
/// See trie-value-format.md
fn trie_value_has_ccc(trie_value: u32) -> bool {
    (trie_value & 0x3FFFFE00) == 0xD800
}

/// Checks if the trie value signifies a special non-starter decomposition.
///
/// See trie-value-format.md
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
    (trie_value & 0x3FFFFF00) == 0xD900
}

/// Checks if a trie value signifies a character whose decomposition
/// starts with a non-starter.
///
/// See trie-value-format.md
fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
    trie_value_has_ccc(trie_value)
}

/// Extracts a canonical combining class (possibly zero) from a trie value.
///
/// See trie-value-format.md
fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
    if trie_value_has_ccc(trie_value) {
        CanonicalCombiningClass::from_icu4c_value(trie_value as u8)
    } else {
        CCC_NOT_REORDERED
    }
}
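
// Worked example (illustrative, per trie-value-format.md): a non-starter
// that decomposes to itself, such as U+0301, has a trie value of the form
// 0xD800 | ccc (possibly with marker bits 30 and 31 set; the masks above
// ignore them). For ccc 230 that's 0xD8E6: `trie_value_has_ccc(0xD8E6)`
// holds because 0xD8E6 & 0x3FFFFE00 == 0xD800, and
// `ccc_from_trie_value(0xD8E6)` truncates to the low byte, 0xE6 == 230.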

/// The tail (everything after the first character) of the NFKD form of U+FDFA
/// as 16-bit units.
static FDFA_NFKD: [u16; 17] = [
    0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
    0x633, 0x644, 0x645,
];
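
// Together with the leading U+0635 returned from `decomposing_next` below,
// this spells the 18-character NFKD of U+FDFA ARABIC LIGATURE SALLALLAHOU
// ALAYHE WASALLAM.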

/// Marker value for U+FDFA in NFKD. (Unified with the Hangul syllable marker,
/// but they differ by `NON_ROUND_TRIP_MARKER`.)
///
/// See trie-value-format.md
const FDFA_MARKER: u16 = 1;

// These constants originate from page 143 of Unicode 14.0.
/// Syllable base
const HANGUL_S_BASE: u32 = 0xAC00;
/// Lead jamo base
const HANGUL_L_BASE: u32 = 0x1100;
/// Vowel jamo base
const HANGUL_V_BASE: u32 = 0x1161;
/// Trail jamo base (deliberately off by one to account for the absence of a trail)
const HANGUL_T_BASE: u32 = 0x11A7;
/// Lead jamo count
const HANGUL_L_COUNT: u32 = 19;
/// Vowel jamo count
const HANGUL_V_COUNT: u32 = 21;
/// Trail jamo count (deliberately off by one to account for the absence of a trail)
const HANGUL_T_COUNT: u32 = 28;
/// Vowel jamo count times trail jamo count
const HANGUL_N_COUNT: u32 = 588;
/// Syllable count
const HANGUL_S_COUNT: u32 = 11172;

/// One past the conjoining jamo block
const HANGUL_JAMO_LIMIT: u32 = 0x1200;
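
// Worked example of the arithmetic these constants support (see `compose`
// and `decomposing_next` below): the Hangul syllable U+D4DB has
// SIndex = 0xD4DB - HANGUL_S_BASE = 10459, which decomposes as
// L = 10459 / 588 = 17, V = (10459 % 588) / 28 = 16, T = 10459 % 28 = 15,
// i.e. <U+1111, U+1171, U+11B6>. Composition runs the same math in reverse:
// 17 * 588 + 16 * 28 + 15 + 0xAC00 == 0xD4DB.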

/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
/// are enabled and return `default` if debug assertions are not enabled.
///
/// Use this only if the only reason why `opt` could be `None` is bogus
/// data from the provider.
#[inline(always)]
fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
    if let Some(val) = opt {
        val
    } else {
        // GIGO case
        debug_assert!(false);
        default
    }
}

/// Convert a `u32` _obtained from data provider data_ to `char`.
#[inline(always)]
fn char_from_u32(u: u32) -> char {
    unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER)
}

/// Convert a `u16` _obtained from data provider data_ to `char`.
#[inline(always)]
fn char_from_u16(u: u16) -> char {
    char_from_u32(u32::from(u))
}

const EMPTY_U16: &ZeroSlice<u16> = zeroslice![];

const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![];

#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
    u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
}

#[inline(always)]
#[cfg(feature = "utf16_iter")]
fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
    u.wrapping_sub(start) <= (end - start)
}
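
// The two range checks above use the standard single-comparison trick:
// for unsigned `x`, `x.wrapping_sub(start) <= end - start` is equivalent
// to `start <= x && x <= end`, because values below `start` wrap around
// to very large numbers. E.g. `in_inclusive_range(c, '\u{11A8}', '\u{11C2}')`
// below compiles to one subtraction and one comparison.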

/// Performs canonical composition (including Hangul) on a pair of
/// characters or returns `None` if these characters don't compose.
/// Composition exclusions are taken into account.
#[inline]
fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    let v = u32::from(second).wrapping_sub(HANGUL_V_BASE);
    if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
        return compose_non_hangul(iter, starter, second);
    }
    if v < HANGUL_V_COUNT {
        let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
        if l < HANGUL_L_COUNT {
            let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
            // Safe, because the inputs are known to be in range.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
        }
        return None;
    }
    if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') {
        let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
        if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
            let lvt = lv + (u32::from(second) - HANGUL_T_BASE);
            // Safe, because the inputs are known to be in range.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
        }
    }
    None
}

/// Performs (non-Hangul) canonical composition on a pair of characters
/// or returns `None` if these characters don't compose. Composition
/// exclusions are taken into account.
fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    // To make the trie smaller, the pairs are stored second character first.
    // Given how this method is used, it's known at each call site whether
    // `second` is or isn't a starter, so we could potentially split the trie
    // into two tries depending on whether `second` is a starter.
    match iter.next(second) {
        TrieResult::NoMatch => None,
        TrieResult::NoValue => match iter.next(starter) {
            TrieResult::NoMatch => None,
            TrieResult::FinalValue(i) => {
                if let Some(c) = char::from_u32(i as u32) {
                    Some(c)
                } else {
                    // GIGO case
                    debug_assert!(false);
                    None
                }
            }
            TrieResult::NoValue | TrieResult::Intermediate(_) => {
                // GIGO case
                debug_assert!(false);
                None
            }
        },
        TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => {
            // GIGO case
            debug_assert!(false);
            None
        }
    }
}

/// See trie-value-format.md
#[inline(always)]
fn starter_and_decomposes_to_self_impl(trie_val: u32) -> bool {
    // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
    // and this function needs to ignore that.
    (trie_val & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0
}

/// See trie-value-format.md
#[inline(always)]
fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> bool {
    (trie_val & (NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER)) == 0
}

/// Struct for holding together a character and the value
/// looked up for it from the NFD trie in a more explicit
/// way than an anonymous pair.
/// Also holds a flag about the supplementary-trie provenance.
#[derive(Debug, PartialEq, Eq)]
struct CharacterAndTrieValue {
    character: char,
    /// See trie-value-format.md
    trie_val: u32,
}

impl CharacterAndTrieValue {
    #[inline(always)]
    pub fn new(c: char, trie_value: u32) -> Self {
        CharacterAndTrieValue {
            character: c,
            trie_val: trie_value,
        }
    }

    #[inline(always)]
    pub fn starter_and_decomposes_to_self(&self) -> bool {
        starter_and_decomposes_to_self_impl(self.trie_val)
    }

    /// See trie-value-format.md
    #[inline(always)]
    #[cfg(feature = "utf8_iter")]
    pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool {
        // This intentionally leaves `NON_ROUND_TRIP_MARKER` in the value
        // to be compared with zero. U+FFFD has that flag set despite
        // actually round-tripping in order to make UTF-8 errors
        // ineligible for passthrough.
        (self.trie_val & !BACKWARD_COMBINING_MARKER) == 0
    }

    /// See trie-value-format.md
    #[inline(always)]
    pub fn can_combine_backwards(&self) -> bool {
        (self.trie_val & BACKWARD_COMBINING_MARKER) != 0
    }
    /// See trie-value-format.md
    #[inline(always)]
    pub fn potential_passthrough(&self) -> bool {
        (self.trie_val & NON_ROUND_TRIP_MARKER) == 0
    }
    /// See trie-value-format.md
    #[inline(always)]
    pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
        potential_passthrough_and_cannot_combine_backwards_impl(self.trie_val)
    }
}

/// Pack a `char` and a `CanonicalCombiningClass` in
/// 32 bits (the former in the lower 24 bits and the
/// latter in the high 8 bits). The latter can be
/// initialized to 0xFF upon creation, in which case
/// it can actually be set later by calling
/// `set_ccc_from_trie_if_not_already_set`. This is
/// a micro optimization to avoid the Canonical
/// Combining Class trie lookup when there is only
/// one combining character in a sequence. This type
/// is intentionally non-`Copy` to get compiler help
/// in making sure that the class is set on the
/// instance on which it is intended to be set
/// and not on a temporary copy.
///
/// Note that 0xFF won't be assigned to an actual
/// canonical combining class per definition D104
/// in The Unicode Standard.
//
// NOTE: The Pernosco debugger has special knowledge
// of this struct. Please do not change the bit layout
// or the crate-module-qualified name of this struct
// without coordination.
#[derive(Debug)]
struct CharacterAndClass(u32);
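
// Layout sketch (illustrative): `CharacterAndClass::new('\u{0300}', CCC_ABOVE)`
// stores 0xE6000300, i.e. U+0300 in bits 0..=23 and class 230 (0xE6) in bits
// 24..=31, while `new_with_placeholder('\u{0300}')` stores 0xFF000300 until
// the real class is looked up from the trie.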

impl CharacterAndClass {
    pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
        CharacterAndClass(u32::from(c) | (u32::from(ccc.to_icu4c_value()) << 24))
    }
    pub fn new_with_placeholder(c: char) -> Self {
        CharacterAndClass(u32::from(c) | ((0xFF) << 24))
    }
    pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
        Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
    }
    pub fn new_starter(c: char) -> Self {
        CharacterAndClass(u32::from(c))
    }
    /// This method must exist for Pernosco to apply its special rendering.
    /// Also, this must not be dead code!
    pub fn character(&self) -> char {
        // Safe, because the low 24 bits came from a `char`
        // originally.
        unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
    }
    /// This method must exist for Pernosco to apply its special rendering.
    pub fn ccc(&self) -> CanonicalCombiningClass {
        CanonicalCombiningClass::from_icu4c_value((self.0 >> 24) as u8)
    }

    pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
        (self.character(), self.ccc())
    }
    pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &Trie) {
        if self.0 >> 24 != 0xFF {
            return;
        }
        let scalar = self.0 & 0xFFFFFF;
        self.0 =
            ((ccc_from_trie_value(trie.get32_u32(scalar)).to_icu4c_value() as u32) << 24) | scalar;
    }
}

// This function exists as a borrow check helper.
#[inline(always)]
fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &Trie) {
    // We don't look up the canonical combining class for starters
    // or for single combining characters between starters. When
    // there's more than one combining character between starters,
    // we look up the canonical combining class for each character
    // exactly once.
    if slice.len() < 2 {
        return;
    }
    slice
        .iter_mut()
        .for_each(|cc| cc.set_ccc_from_trie_if_not_already_set(trie));
    slice.sort_by_key(|cc| cc.ccc());
}
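
// Canonical Ordering in action (illustrative): given the buffer
// <U+0301 (ccc 230), U+0323 (ccc 220)>, the stable sort above reorders it
// to <U+0323, U+0301> because 220 < 230, while characters with equal
// classes keep their relative order.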

/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed `char` sequence.
#[derive(Debug)]
pub struct Decomposition<'data, I>
where
    I: Iterator<Item = char>,
{
    delegate: I,
    buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA
    /// The index of the next item to be read from `buffer`.
    /// The purpose of this index is to avoid having to move
    /// the rest upon every read.
    buffer_pos: usize,
    // At the start of `next()` if not `None`, this is a pending unnormalized
    // starter. When `Decomposition` appears alone, this is never a non-starter.
    // However, when `Decomposition` appears inside a `Composition`, this
    // may become a non-starter before `decomposing_next()` is called.
    pending: Option<CharacterAndTrieValue>, // None at end of stream
    // See trie-value-format.md
    trie: &'data Trie<'data>,
    scalars16: &'data ZeroSlice<u16>,
    scalars24: &'data ZeroSlice<char>,
    supplementary_scalars16: &'data ZeroSlice<u16>,
    supplementary_scalars24: &'data ZeroSlice<char>,
    /// The lowest character for which either of the following does
    /// not hold:
    /// 1. Decomposes to self.
    /// 2. Decomposition starts with a non-starter.
    decomposition_passthrough_bound: u32, // never above 0xC0
    ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
}

impl<'data, I> Decomposition<'data, I>
where
    I: Iterator<Item = char>,
{
    /// Constructs a decomposing iterator adapter from a delegate
    /// iterator and references to the necessary data, without
    /// supplementary data.
    ///
    /// Use `DecomposingNormalizer::normalize_iter()` instead unless
    /// there's a good reason to use this constructor directly.
    ///
    /// Public but hidden in order to be able to use this from the
    /// collator.
    #[doc(hidden)] // used in collator
    pub fn new(
        delegate: I,
        decompositions: &'data DecompositionData,
        tables: &'data DecompositionTables,
    ) -> Self {
        Self::new_with_supplements(
            delegate,
            decompositions,
            tables,
            None,
            0xC0,
            IgnorableBehavior::Unsupported,
        )
    }

    /// Constructs a decomposing iterator adapter from a delegate
    /// iterator and references to the necessary data, including
    /// supplementary data.
    ///
    /// Use `DecomposingNormalizer::normalize_iter()` instead unless
    /// there's a good reason to use this constructor directly.
    fn new_with_supplements(
        delegate: I,
        decompositions: &'data DecompositionData,
        tables: &'data DecompositionTables,
        supplementary_tables: Option<&'data DecompositionTables>,
        decomposition_passthrough_bound: u8,
        ignorable_behavior: IgnorableBehavior,
    ) -> Self {
        let mut ret = Decomposition::<I> {
            delegate,
            buffer: SmallVec::new(), // Normalized
            buffer_pos: 0,
            // Initialize with a placeholder starter in case
            // the real stream starts with a non-starter.
            pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)),
            #[allow(clippy::useless_conversion, clippy::expect_used)] // Expectation always succeeds when untyped tries are in use
            trie: <&Trie>::try_from(&decompositions.trie).expect("Unexpected trie type in data"),
            scalars16: &tables.scalars16,
            scalars24: &tables.scalars24,
            supplementary_scalars16: if let Some(supplementary) = supplementary_tables {
                &supplementary.scalars16
            } else {
                EMPTY_U16
            },
            supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
                &supplementary.scalars24
            } else {
                EMPTY_CHAR
            },
            decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
            ignorable_behavior,
        };
        let _ = ret.next(); // Remove the U+FFFF placeholder
        ret
    }

    fn push_decomposition16(
        &mut self,
        offset: usize,
        len: usize,
        only_non_starters_in_trail: bool,
        slice16: &ZeroSlice<u16>,
    ) -> (char, usize) {
        let (starter, tail) = slice16
            .get_subslice(offset..offset + len)
            .and_then(|slice| slice.split_first())
            .map_or_else(
                || {
                    // GIGO case
                    debug_assert!(false);
                    (REPLACEMENT_CHARACTER, EMPTY_U16)
                },
                |(first, trail)| (char_from_u16(first), trail),
            );
        if only_non_starters_in_trail {
            // All the rest are combining
            self.buffer.extend(
                tail.iter()
                    .map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))),
            );
            (starter, 0)
        } else {
            let mut i = 0;
            let mut combining_start = 0;
            for u in tail.iter() {
                let ch = char_from_u16(u);
                let trie_value = self.trie.get(ch);
                self.buffer.push(CharacterAndClass::new_with_trie_value(
                    CharacterAndTrieValue::new(ch, trie_value),
                ));
                i += 1;
                // Half-width kana and iota subscript don't occur in the tails
                // of these multicharacter decompositions.
                if !decomposition_starts_with_non_starter(trie_value) {
                    combining_start = i;
                }
            }
            (starter, combining_start)
        }
    }

    fn push_decomposition32(
        &mut self,
        offset: usize,
        len: usize,
        only_non_starters_in_trail: bool,
        slice32: &ZeroSlice<char>,
    ) -> (char, usize) {
        let (starter, tail) = slice32
            .get_subslice(offset..offset + len)
            .and_then(|slice| slice.split_first())
            .unwrap_or_else(|| {
                // GIGO case
                debug_assert!(false);
                (REPLACEMENT_CHARACTER, EMPTY_CHAR)
            });
        if only_non_starters_in_trail {
            // All the rest are combining
            self.buffer
                .extend(tail.iter().map(CharacterAndClass::new_with_placeholder));
            (starter, 0)
        } else {
            let mut i = 0;
            let mut combining_start = 0;
            for ch in tail.iter() {
                let trie_value = self.trie.get(ch);
                self.buffer.push(CharacterAndClass::new_with_trie_value(
                    CharacterAndTrieValue::new(ch, trie_value),
                ));
                i += 1;
                // Half-width kana and iota subscript don't occur in the tails
                // of these multicharacter decompositions.
                if !decomposition_starts_with_non_starter(trie_value) {
                    combining_start = i;
                }
            }
            (starter, combining_start)
        }
    }

    #[inline(always)]
    fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
        CharacterAndTrieValue::new(c, self.trie.get(c))
    }

    fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
        debug_assert!(self.pending.is_none());
        loop {
            let c = self.delegate.next()?;

            // TODO(#2384): Measure if this check is actually an optimization.
            if u32::from(c) < self.decomposition_passthrough_bound {
                return Some(CharacterAndTrieValue::new(c, 0));
            }

            let trie_val = self.trie.get(c);
            // TODO: Can we do something better about the cost of this branch in the
            // non-UTS 46 case?
            if trie_val == IGNORABLE_MARKER {
                match self.ignorable_behavior {
                    IgnorableBehavior::Unsupported => {
                        debug_assert!(false);
                    }
                    IgnorableBehavior::ReplacementCharacter => {
                        return Some(CharacterAndTrieValue::new(
                            c,
                            u32::from(REPLACEMENT_CHARACTER) | NON_ROUND_TRIP_MARKER,
                        ));
                    }
                    IgnorableBehavior::Ignored => {
                        // Else ignore this character by reading the next one from the delegate.
                        continue;
                    }
                }
            }
            return Some(CharacterAndTrieValue::new(c, trie_val));
        }
    }

    fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
        if let Some(pending) = self.pending.take() {
            // Only happens as part of `Composition` and as part of
            // the contiguous-buffer methods of `DecomposingNormalizer`.
            // I.e. does not happen as part of standalone iterator
            // usage of `Decomposition`.
            Some(pending)
        } else {
            self.delegate_next_no_pending()
        }
    }

    fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
        let (starter, combining_start) = {
            let c = c_and_trie_val.character;
            // See trie-value-format.md
            let decomposition = c_and_trie_val.trie_val;
            // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
            // and that flag needs to be ignored here.
            if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
                // The character is its own decomposition
                (c, 0)
            } else {
                let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
                let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
                if !high_zeros && !low_zeros {
                    // Decomposition into two BMP characters: starter and non-starter
                    let starter = char_from_u32(decomposition & 0x7FFF);
                    let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
                    self.buffer
                        .push(CharacterAndClass::new_with_placeholder(combining));
                    (starter, 0)
                } else if high_zeros {
                    // Do the check by looking at `c` instead of looking at a marker
                    // in `singleton` below, because if we looked at the trie value,
                    // we'd still have to check that `c` is in the Hangul syllable
                    // range in order for the subsequent interpretations as `char`
                    // to be safe.
                    // Alternatively, `FDFA_MARKER` and the Hangul marker could
                    // be unified. That would add a branch for Hangul and remove
                    // a branch from singleton decompositions. It seems more
                    // important to favor Hangul syllables over singleton
                    // decompositions.
                    // Note that it would be valid to hoist this Hangul check
                    // one or even two steps earlier in this check hierarchy.
                    // Right now, it's assumed that the kind of decomposition into
                    // a BMP starter and non-starter, which occurs in many languages,
                    // should be checked before Hangul syllables, which are specific
                    // to one language. Hopefully, we get some
                    // instruction-level parallelism out of the disjointness of
                    // operations on `c` and `decomposition`.
                    let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
                    if hangul_offset < HANGUL_S_COUNT {
                        debug_assert_eq!(decomposition, 1);
                        // Hangul syllable
                        // The math here comes from page 144 of Unicode 14.0
                        let l = hangul_offset / HANGUL_N_COUNT;
                        let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
                        let t = hangul_offset % HANGUL_T_COUNT;

                        // The unsafe blocks here are OK, because the values stay
                        // within the Hangul jamo block and, therefore, the scalar
                        // value range by construction.
                        self.buffer.push(CharacterAndClass::new_starter(unsafe {
                            core::char::from_u32_unchecked(HANGUL_V_BASE + v)
                        }));
                        let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
                        if t != 0 {
                            self.buffer.push(CharacterAndClass::new_starter(unsafe {
                                core::char::from_u32_unchecked(HANGUL_T_BASE + t)
                            }));
                            (first, 2)
                        } else {
                            (first, 1)
                        }
                    } else {
                        let singleton = decomposition as u16;
                        if singleton != FDFA_MARKER {
                            // Decomposition into one BMP character
                            let starter = char_from_u16(singleton);
                            (starter, 0)
                        } else {
                            // Special case for the NFKD form of U+FDFA.
                            self.buffer.extend(FDFA_NFKD.map(|u| {
                                // SAFETY: `FDFA_NFKD` is known not to contain
                                // surrogates.
                                CharacterAndClass::new_starter(unsafe {
                                    core::char::from_u32_unchecked(u32::from(u))
                                })
                            }));
                            ('\u{0635}', 17)
                        }
                    }
                } else {
                    debug_assert!(low_zeros);
                    // Only 12 of 14 bits used as of Unicode 16.
                    let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
                    // Only 3 of 4 bits used as of Unicode 16.
                    let len_bits = decomposition & 0b1111;
                    let only_non_starters_in_trail = (decomposition & 0b10000) != 0;
                    if offset < self.scalars16.len() {
                        self.push_decomposition16(
                            offset,
                            (len_bits + 2) as usize,
                            only_non_starters_in_trail,
                            self.scalars16,
                        )
                    } else if offset < self.scalars16.len() + self.scalars24.len() {
                        self.push_decomposition32(
                            offset - self.scalars16.len(),
                            (len_bits + 1) as usize,
                            only_non_starters_in_trail,
                            self.scalars24,
                        )
                    } else if offset
                        < self.scalars16.len()
                            + self.scalars24.len()
                            + self.supplementary_scalars16.len()
                    {
                        self.push_decomposition16(
                            offset - (self.scalars16.len() + self.scalars24.len()),
                            (len_bits + 2) as usize,
                            only_non_starters_in_trail,
                            self.supplementary_scalars16,
                        )
                    } else {
                        self.push_decomposition32(
                            offset
                                - (self.scalars16.len()
                                    + self.scalars24.len()
                                    + self.supplementary_scalars16.len()),
                            (len_bits + 1) as usize,
                            only_non_starters_in_trail,
                            self.supplementary_scalars24,
                        )
                    }
                }
            }
        };
        // Either we're inside `Composition` or `self.pending.is_none()`.

        self.gather_and_sort_combining(combining_start);
        starter
    }

    fn gather_and_sort_combining(&mut self, combining_start: usize) {
        // Not a `for` loop to avoid holding a mutable reference to `self` across
        // the loop body.
        while let Some(ch_and_trie_val) = self.delegate_next() {
            if !trie_value_has_ccc(ch_and_trie_val.trie_val) {
                self.pending = Some(ch_and_trie_val);
                break;
            } else if !trie_value_indicates_special_non_starter_decomposition(
                ch_and_trie_val.trie_val,
            ) {
                self.buffer
                    .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
            } else {
                // The Tibetan special cases are starters that decompose into non-starters.
                let mapped = match ch_and_trie_val.character {
                    '\u{0340}' => {
                        // COMBINING GRAVE TONE MARK
                        CharacterAndClass::new('\u{0300}', CCC_ABOVE)
                    }
                    '\u{0341}' => {
                        // COMBINING ACUTE TONE MARK
                        CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                    }
                    '\u{0343}' => {
                        // COMBINING GREEK KORONIS
                        CharacterAndClass::new('\u{0313}', CCC_ABOVE)
                    }
                    '\u{0344}' => {
                        // COMBINING GREEK DIALYTIKA TONOS
                        self.buffer
                            .push(CharacterAndClass::new('\u{0308}', CCC_ABOVE));
                        CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                    }
                    '\u{0F73}' => {
                        // TIBETAN VOWEL SIGN II
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F72}', ccc!(CCC130, 130))
                    }
                    '\u{0F75}' => {
                        // TIBETAN VOWEL SIGN UU
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F74}', ccc!(CCC132, 132))
                    }
                    '\u{0F81}' => {
                        // TIBETAN VOWEL SIGN REVERSED II
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F80}', ccc!(CCC130, 130))
                    }
                    '\u{FF9E}' => {
                        // HALFWIDTH KATAKANA VOICED SOUND MARK
                        CharacterAndClass::new('\u{3099}', ccc!(KanaVoicing, 8))
                    }
                    '\u{FF9F}' => {
                        // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
                        CharacterAndClass::new('\u{309A}', ccc!(KanaVoicing, 8))
                    }
                    _ => {
                        // GIGO case
                        debug_assert!(false);
                        CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
                    }
                };
                self.buffer.push(mapped);
            }
        }
        // Slicing succeeds by construction; we've always ensured that `combining_start`
        // is in permissible range.
        #[expect(clippy::indexing_slicing)]
        sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
    }
}

impl<I> Iterator for Decomposition<'_, I>
where
    I: Iterator<Item = char>,
{
    type Item = char;

    fn next(&mut self) -> Option<char> {
        if let Some(ret) = self.buffer.get(self.buffer_pos).map(|c| c.character()) {
            self.buffer_pos += 1;
            if self.buffer_pos == self.buffer.len() {
                self.buffer.clear();
                self.buffer_pos = 0;
            }
            return Some(ret);
        }
        debug_assert_eq!(self.buffer_pos, 0);
        let c_and_trie_val = self.pending.take()?;
        Some(self.decomposing_next(c_and_trie_val))
    }
}

/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed and then canonically composed `char` sequence.
#[derive(Debug)]
pub struct Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    /// The decomposing part of the normalizer that operates before
    /// the canonical composition is performed on its output.
    decomposition: Decomposition<'data, I>,
    /// Non-Hangul canonical composition data.
    canonical_compositions: Char16Trie<'data>,
    /// To make `next()` yield in cases where there's a non-composing
    /// starter in the decomposition buffer, we put it here to let it
    /// wait for the next `next()` call (or a jump forward within the
    /// `next()` call).
    unprocessed_starter: Option<char>,
    /// The lowest character for which any one of the following does
    /// not hold:
    /// 1. Roundtrips via decomposition and recomposition.
    /// 2. Decomposition starts with a non-starter.
    /// 3. Is not a backward-combining starter.
    composition_passthrough_bound: u32,
}

impl<'data, I> Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    fn new(
        decomposition: Decomposition<'data, I>,
        canonical_compositions: Char16Trie<'data>,
        composition_passthrough_bound: u16,
    ) -> Self {
        Self {
            decomposition,
            canonical_compositions,
            unprocessed_starter: None,
            composition_passthrough_bound: u32::from(composition_passthrough_bound),
        }
    }

    /// Performs canonical composition (including Hangul) on a pair of
    /// characters or returns `None` if these characters don't compose.
    /// Composition exclusions are taken into account.
    #[inline(always)]
    pub fn compose(&self, starter: char, second: char) -> Option<char> {
        compose(self.canonical_compositions.iter(), starter, second)
    }

    /// Performs (non-Hangul) canonical composition on a pair of characters
    /// or returns `None` if these characters don't compose. Composition
    /// exclusions are taken into account.
    #[inline(always)]
    fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
        compose_non_hangul(self.canonical_compositions.iter(), starter, second)
    }
}

impl<I> Iterator for Composition<'_, I>
where
    I: Iterator<Item = char>,
{
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0); // The compiler can't figure out that this gets overwritten before use.
        if self.unprocessed_starter.is_none() {
            // The loop is only broken out of as goto forward
            #[expect(clippy::never_loop)]
            loop {
                if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    self.decomposition.buffer_pos += 1;
                    if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
                        self.decomposition.buffer.clear();
                        self.decomposition.buffer_pos = 0;
                    }
                    if ccc == CCC_NOT_REORDERED {
                        // Previous decomposition contains a starter. This must
                        // now become the `unprocessed_starter` for it to have
                        // a chance to compose with the upcoming characters.
                        //
                        // E.g. parenthesized Hangul in NFKC comes through here,
                        // but a suitable composition exclusion could exercise this
                        // in NFC.
                        self.unprocessed_starter = Some(character);
                        break; // We already have a starter, so skip taking one from `pending`.
                    }
                    return Some(character);
                }
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                undecomposed_starter = self.decomposition.pending.take()?;
                if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
                    || undecomposed_starter.potential_passthrough()
                {
                    // TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming
                    // character is not below `decomposition_passthrough_bound` but is
                    // below `composition_passthrough_bound`, we read from the trie
                    // unnecessarily.
                    if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
                        let cannot_combine_backwards = u32::from(upcoming.character)
                            < self.composition_passthrough_bound
                            || !upcoming.can_combine_backwards();
                        self.decomposition.pending = Some(upcoming);
                        if cannot_combine_backwards {
                            // Fast-track succeeded!
                            return Some(undecomposed_starter.character);
                        }
                    } else {
                        // End of stream
                        return Some(undecomposed_starter.character);
                    }
                }
                break; // Not actually looping
            }
        }
        let mut starter = '\u{0}'; // The compiler can't figure out this gets overwritten before use.

        // The point of having this boolean is to have only one call site to
        // `self.decomposition.decomposing_next`, which is hopefully beneficial for
        // code size under inlining.
        let mut attempt_composition = false;
        loop {
            if let Some(unprocessed) = self.unprocessed_starter.take() {
                debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0));
                debug_assert_eq!(starter, '\u{0}');
                starter = unprocessed;
            } else {
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
                if !attempt_composition {
                    starter = next_starter;
                } else if let Some(composed) = self.compose(starter, next_starter) {
                    starter = composed;
                } else {
                    // This is our yield point. We'll pick this up above in the
                    // next call to `next()`.
                    self.unprocessed_starter = Some(next_starter);
                    return Some(starter);
                }
            }
            // We first loop by index to avoid moving the contents of `buffer`, but
            // if there's a discontiguous match, we'll start modifying `buffer` instead.
            loop {
                let (character, ccc) = if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    (character, ccc)
                } else {
                    self.decomposition.buffer.clear();
                    self.decomposition.buffer_pos = 0;
                    break;
                };
                if let Some(composed) = self.compose(starter, character) {
                    starter = composed;
                    self.decomposition.buffer_pos += 1;
                    continue;
                }
                let mut most_recent_skipped_ccc = ccc;
                {
                    let _ = self
                        .decomposition
                        .buffer
                        .drain(0..self.decomposition.buffer_pos);
                }
                self.decomposition.buffer_pos = 0;
                if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                    // We failed to compose a starter. Discontiguous match not allowed.
                    // We leave the starter in `buffer` for `next()` to find.
                    return Some(starter);
                }
                let mut i = 1; // We have skipped one non-starter.
                while let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(i)
                    .map(|c| c.character_and_ccc())
                {
                    if ccc == CCC_NOT_REORDERED {
                        // Discontiguous match not allowed.
                        return Some(starter);
                    }
                    debug_assert!(ccc >= most_recent_skipped_ccc);
                    if ccc != most_recent_skipped_ccc {
                        // Using the non-Hangul version as a micro-optimization, since
                        // we already rejected the case where `second` is a starter
                        // above, and conjoining jamo are starters.
                        if let Some(composed) = self.compose_non_hangul(starter, character) {
                            self.decomposition.buffer.remove(i);
                            starter = composed;
                            continue;
                        }
                    }
                    most_recent_skipped_ccc = ccc;
                    i += 1;
                }
                break;
            }

            debug_assert_eq!(self.decomposition.buffer_pos, 0);

            if !self.decomposition.buffer.is_empty() {
                return Some(starter);
            }
            // Now we need to check if composition with an upcoming starter is possible.
            #[expect(clippy::unwrap_used)]
            if self.decomposition.pending.is_some() {
                // We know that `pending_starter` decomposes to start with a starter.
                // Otherwise, it would have been moved to `self.decomposition.buffer`
                // by `self.decomposing_next()`. We do this set lookup here in order
                // to get an opportunity to go back to the fast track.
                // Note that this check has to happen _after_ checking that `pending`
                // holds a character, because this flag isn't defined to be meaningful
                // when `pending` isn't holding a character.
                let pending = self.decomposition.pending.as_ref().unwrap();
                if u32::from(pending.character) < self.composition_passthrough_bound
                    || !pending.can_combine_backwards()
                {
                    // Won't combine backwards anyway.
                    return Some(starter);
                }
                // Consume what we peeked. `unwrap` OK, because we checked `is_some()`
                // above.
                undecomposed_starter = self.decomposition.pending.take().unwrap();
                // The following line is OK, because we're about to loop back
                // to `self.decomposition.decomposing_next(c);`, which will
                // restore the between-`next()`-calls invariant of `pending`
                // before this function returns.
                attempt_composition = true;
                continue;
            }
            // End of input
            return Some(starter);
        }
    }
}

macro_rules! composing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $always_valid_utf:literal,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $composition:ident,
     $composition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $len_utf:ident,
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog
            let mut $composition = self.normalize_iter($text.chars());
            debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
            for cc in $composition.decomposition.buffer.drain(..) {
                $sink.write_char(cc.character())?;
            }

            // Try to get the compiler to hoist the bound to a register.
            let $composition_passthrough_bound = $composition.composition_passthrough_bound;
            'outer: loop {
                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                let mut $undecomposed_starter =
                    if let Some(pending) = $composition.decomposition.pending.take() {
                        pending
                    } else {
                        return Ok(());
                    };
                if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
                    $undecomposed_starter.potential_passthrough()
                {
                    // We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
                    // was returned in response to an error by the iterator. Assume the
                    // latter for correctness even though it pessimizes the former.
                    if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
                        let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
                        // The `$fast` block must either:
                        // 1. Return due to reaching EOF
                        // 2. Leave a starter with its trie value in `$undecomposed_starter`
                        //    and, if there is still more input, leave the next character
                        //    and its trie value in `$composition.decomposition.pending`.
                        $fast
                    }
                }
                // Fast track above, full algorithm below
                let mut starter = $composition
                    .decomposition
                    .decomposing_next($undecomposed_starter);
                'bufferloop: loop {
                    // We first loop by index to avoid moving the contents of `buffer`, but
                    // if there's a discontiguous match, we'll start modifying `buffer` instead.
                    loop {
                        let (character, ccc) = if let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get($composition.decomposition.buffer_pos)
                            .map(|c| c.character_and_ccc())
                        {
                            (character, ccc)
                        } else {
                            $composition.decomposition.buffer.clear();
                            $composition.decomposition.buffer_pos = 0;
                            break;
                        };
                        if let Some(composed) = $composition.compose(starter, character) {
                            starter = composed;
                            $composition.decomposition.buffer_pos += 1;
                            continue;
                        }
                        let mut most_recent_skipped_ccc = ccc;
                        if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                            // We failed to compose a starter. Discontiguous match not allowed.
                            // Write the current `starter` we've been composing, make the unmatched
                            // starter in the buffer the new `starter` (we know it's been decomposed)
                            // and process the rest of the buffer with that as the starter.
                            $sink.write_char(starter)?;
                            starter = character;
                            $composition.decomposition.buffer_pos += 1;
                            continue 'bufferloop;
                        } else {
                            {
                                let _ = $composition
                                    .decomposition
                                    .buffer
                                    .drain(0..$composition.decomposition.buffer_pos);
                            }
                            $composition.decomposition.buffer_pos = 0;
                        }
                        let mut i = 1; // We have skipped one non-starter.
                        while let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get(i)
                            .map(|c| c.character_and_ccc())
                        {
                            if ccc == CCC_NOT_REORDERED {
                                // Discontiguous match not allowed.
                                $sink.write_char(starter)?;
                                for cc in $composition.decomposition.buffer.drain(..i) {
                                    $sink.write_char(cc.character())?;
                                }
                                starter = character;
                                {
                                    let removed = $composition.decomposition.buffer.remove(0);
                                    debug_assert_eq!(starter, removed.character());
                                }
                                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                                continue 'bufferloop;
                            }
                            debug_assert!(ccc >= most_recent_skipped_ccc);
                            if ccc != most_recent_skipped_ccc {
                                // Using the non-Hangul version as a micro-optimization, since
                                // we already rejected the case where `second` is a starter
                                // above, and conjoining jamo are starters.
                                if let Some(composed) =
                                    $composition.compose_non_hangul(starter, character)
                                {
                                    $composition.decomposition.buffer.remove(i);
                                    starter = composed;
                                    continue;
                                }
                            }
                            most_recent_skipped_ccc = ccc;
                            i += 1;
                        }
                        break;
                    }
                    debug_assert_eq!($composition.decomposition.buffer_pos, 0);

                    if !$composition.decomposition.buffer.is_empty() {
                        $sink.write_char(starter)?;
                        for cc in $composition.decomposition.buffer.drain(..) {
                            $sink.write_char(cc.character())?;
                        }
                        // We had a non-empty buffer, so can't compose with upcoming.
                        continue 'outer;
                    }
                    // Now we need to check if composition with an upcoming starter is possible.
                    if $composition.decomposition.pending.is_some() {
                        // We know that `pending_starter` decomposes to start with a starter.
                        // Otherwise, it would have been moved to `composition.decomposition.buffer`
                        // by `composition.decomposing_next()`. We do this set lookup here in order
                        // to get an opportunity to go back to the fast track.
                        // Note that this check has to happen _after_ checking that `pending`
                        // holds a character, because this flag isn't defined to be meaningful
                        // when `pending` isn't holding a character.
                        let pending = $composition.decomposition.pending.as_ref().unwrap();
                        if u32::from(pending.character) < $composition.composition_passthrough_bound
                            || !pending.can_combine_backwards()
                        {
                            // Won't combine backwards anyway.
                            $sink.write_char(starter)?;
                            continue 'outer;
                        }
                        let pending_starter = $composition.decomposition.pending.take().unwrap();
                        let decomposed = $composition.decomposition.decomposing_next(pending_starter);
                        if let Some(composed) = $composition.compose(starter, decomposed) {
                            starter = composed;
                        } else {
                            $sink.write_char(starter)?;
                            starter = decomposed;
                        }
                        continue 'bufferloop;
                    }
                    // End of input
                    $sink.write_char(starter)?;
                    return Ok(());
                } // 'bufferloop
            }
        }
    };
}

macro_rules! decomposing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $decomposition:ident,
     $decomposition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $outer:lifetime, // loop labels use lifetime tokens
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog

            let mut $decomposition = self.normalize_iter($text.chars());
            debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);

            // Try to get the compiler to hoist the bound to a register.
            let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
            $outer: loop {
                for cc in $decomposition.buffer.drain(..) {
                    $sink.write_char(cc.character())?;
                }
                debug_assert_eq!($decomposition.buffer_pos, 0);
                let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
                    pending
                } else {
                    return Ok(());
                };
                if $undecomposed_starter.starter_and_decomposes_to_self() {
                    // Don't bother including `undecomposed_starter` in a contiguous buffer
                    // write: Just write it right away:
                    $sink.write_char($undecomposed_starter.character)?;

                    let $pending_slice = $decomposition.delegate.$as_slice();
                    $fast
                }
                let starter = $decomposition.decomposing_next($undecomposed_starter);
                $sink.write_char(starter)?;
            }
        }
    };
}
1508
1509macro_rules! normalizer_methods {
1510 () => {
1511 /// Normalize a string slice into a `Cow<'a, str>`.
1512 pub fn normalize<'a>(&self, text: &'a str) -> Cow<'a, str> {
1513 let (head, tail) = self.split_normalized(text);
1514 if tail.is_empty() {
1515 return Cow::Borrowed(head);
1516 }
1517 let mut ret = String::new();
1518 ret.reserve(text.len());
1519 ret.push_str(head);
1520 let _ = self.normalize_to(tail, &mut ret);
1521 Cow::Owned(ret)
1522 }
1523
1524 /// Split a string slice into maximum normalized prefix and unnormalized suffix
1525 /// such that the concatenation of the prefix and the normalization of the suffix
1526 /// is the normalization of the whole input.
1527 pub fn split_normalized<'a>(&self, text: &'a str) -> (&'a str, &'a str) {
1528 let up_to = self.is_normalized_up_to(text);
1529 text.split_at_checked(up_to).unwrap_or_else(|| {
1530 // Internal bug, not even GIGO, never supposed to happen
1531 debug_assert!(false);
1532 ("", text)
1533 })
1534 }
1535
1536 /// Return the index a string slice is normalized up to.
1537 fn is_normalized_up_to(&self, text: &str) -> usize {
1538 let mut sink = IsNormalizedSinkStr::new(text);
1539 let _ = self.normalize_to(text, &mut sink);
1540 text.len() - sink.remaining_len()
1541 }
1542
1543 /// Check whether a string slice is normalized.
1544 pub fn is_normalized(&self, text: &str) -> bool {
1545 self.is_normalized_up_to(text) == text.len()
1546 }
1547
1548 /// Normalize a slice of potentially-invalid UTF-16 into a `Cow<'a, [u16]>`.
1549 ///
1550 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
1551 /// before normalizing.
1552 ///
1553 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1554 #[cfg(feature = "utf16_iter")]
1555 pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
1556 let (head, tail) = self.split_normalized_utf16(text);
1557 if tail.is_empty() {
1558 return Cow::Borrowed(head);
1559 }
1560 let mut ret = alloc::vec::Vec::with_capacity(text.len());
1561 ret.extend_from_slice(head);
1562 let _ = self.normalize_utf16_to(tail, &mut ret);
1563 Cow::Owned(ret)
1564 }
1565
1566 /// Split a slice of potentially-invalid UTF-16 into maximum normalized (and valid)
1567 /// prefix and unnormalized suffix such that the concatenation of the prefix and the
1568 /// normalization of the suffix is the normalization of the whole input.
1569 ///
1570 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1571 #[cfg(feature = "utf16_iter")]
1572 pub fn split_normalized_utf16<'a>(&self, text: &'a [u16]) -> (&'a [u16], &'a [u16]) {
1573 let up_to = self.is_normalized_utf16_up_to(text);
1574 text.split_at_checked(up_to).unwrap_or_else(|| {
1575 // Internal bug, not even GIGO, never supposed to happen
1576 debug_assert!(false);
1577 (&[], text)
1578 })
1579 }
1580
1581 /// Return the index a slice of potentially-invalid UTF-16 is normalized up to.
1582 ///
1583 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1584 #[cfg(feature = "utf16_iter")]
1585 fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
1586 let mut sink = IsNormalizedSinkUtf16::new(text);
1587 let _ = self.normalize_utf16_to(text, &mut sink);
1588 text.len() - sink.remaining_len()
1589 }
1590
        /// Check whether a slice of potentially-invalid UTF-16 is normalized.
1592 ///
1593 /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
1594 ///
1595 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1596 #[cfg(feature = "utf16_iter")]
1597 pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
1598 self.is_normalized_utf16_up_to(text) == text.len()
1599 }
1600
1601 /// Normalize a slice of potentially-invalid UTF-8 into a `Cow<'a, str>`.
1602 ///
1603 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1604 /// according to the WHATWG Encoding Standard.
1605 ///
1606 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
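        ///
        /// # Example
        ///
        /// A minimal sketch (assumes the `compiled_data` feature):
        ///
        /// ```
        /// let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
        /// // 0x61 0xCC 0x81 is "a" followed by U+0301 in UTF-8.
        /// assert_eq!(nfc.normalize_utf8(b"a\xCC\x81"), "á");
        /// // An ill-formed byte sequence becomes U+FFFD.
        /// assert_eq!(nfc.normalize_utf8(b"\xFF"), "\u{FFFD}");
        /// ```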
1607 #[cfg(feature = "utf8_iter")]
1608 pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
1609 let (head, tail) = self.split_normalized_utf8(text);
1610 if tail.is_empty() {
1611 return Cow::Borrowed(head);
1612 }
1613 let mut ret = String::new();
1614 ret.reserve(text.len());
1615 ret.push_str(head);
1616 let _ = self.normalize_utf8_to(tail, &mut ret);
1617 Cow::Owned(ret)
1618 }
1619
1620 /// Split a slice of potentially-invalid UTF-8 into maximum normalized (and valid)
1621 /// prefix and unnormalized suffix such that the concatenation of the prefix and the
1622 /// normalization of the suffix is the normalization of the whole input.
1623 ///
1624 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1625 #[cfg(feature = "utf8_iter")]
1626 pub fn split_normalized_utf8<'a>(&self, text: &'a [u8]) -> (&'a str, &'a [u8]) {
1627 let up_to = self.is_normalized_utf8_up_to(text);
1628 let (head, tail) = text.split_at_checked(up_to).unwrap_or_else(|| {
1629 // Internal bug, not even GIGO, never supposed to happen
1630 debug_assert!(false);
1631 (&[], text)
1632 });
1633 // SAFETY: The normalization check also checks for
1634 // UTF-8 well-formedness.
1635 (unsafe { core::str::from_utf8_unchecked(head) }, tail)
1636 }
1637
        /// Return the index a slice of potentially-invalid UTF-8 is normalized up to.
1639 ///
1640 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1641 #[cfg(feature = "utf8_iter")]
1642 fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
1643 let mut sink = IsNormalizedSinkUtf8::new(text);
1644 let _ = self.normalize_utf8_to(text, &mut sink);
1645 text.len() - sink.remaining_len()
1646 }
1647
        /// Check whether a slice of potentially-invalid UTF-8 is normalized.
1649 ///
1650 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1651 /// according to the WHATWG Encoding Standard before checking.
1652 ///
1653 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1654 #[cfg(feature = "utf8_iter")]
1655 pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
1656 self.is_normalized_utf8_up_to(text) == text.len()
1657 }
1658 };
1659}
1660
1661/// Borrowed version of a normalizer for performing decomposing normalization.
1662#[derive(Debug)]
1663pub struct DecomposingNormalizerBorrowed<'a> {
1664 decompositions: &'a DecompositionData<'a>,
1665 tables: &'a DecompositionTables<'a>,
1666 supplementary_tables: Option<&'a DecompositionTables<'a>>,
1667 decomposition_passthrough_bound: u8, // never above 0xC0
1668 composition_passthrough_bound: u16, // never above 0x0300
1669}
1670
1671impl DecomposingNormalizerBorrowed<'static> {
1672 /// Cheaply converts a [`DecomposingNormalizerBorrowed<'static>`] into a [`DecomposingNormalizer`].
1673 ///
1674 /// Note: Due to branching and indirection, using [`DecomposingNormalizer`] might inhibit some
1675 /// compile-time optimizations that are possible with [`DecomposingNormalizerBorrowed`].
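    ///
    /// # Example
    ///
    /// A minimal sketch (assumes the `compiled_data` feature):
    ///
    /// ```
    /// use icu_normalizer::{DecomposingNormalizer, DecomposingNormalizerBorrowed};
    ///
    /// let owned: DecomposingNormalizer =
    ///     DecomposingNormalizerBorrowed::new_nfd().static_to_owned();
    /// assert_eq!(owned.as_borrowed().normalize("é"), "e\u{0301}");
    /// ```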
1676 pub const fn static_to_owned(self) -> DecomposingNormalizer {
1677 DecomposingNormalizer {
1678 decompositions: DataPayload::from_static_ref(self.decompositions),
1679 tables: DataPayload::from_static_ref(self.tables),
1680 supplementary_tables: if let Some(s) = self.supplementary_tables {
1681 // `map` not available in const context
1682 Some(DataPayload::from_static_ref(s))
1683 } else {
1684 None
1685 },
1686 decomposition_passthrough_bound: self.decomposition_passthrough_bound,
1687 composition_passthrough_bound: self.composition_passthrough_bound,
1688 }
1689 }
1690
1691 /// NFD constructor using compiled data.
1692 ///
1693 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1694 ///
1695 /// [📚 Help choosing a constructor](icu_provider::constructors)
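    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```
    /// let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
    /// assert_eq!(nfd.normalize("é"), "e\u{0301}");
    /// ```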
1696 #[cfg(feature = "compiled_data")]
1697 pub const fn new_nfd() -> Self {
1698 const _: () = assert!(
1699 crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1700 .scalars16
1701 .const_len()
1702 + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1703 .scalars24
1704 .const_len()
1705 <= 0xFFF,
1706 "future extension"
1707 );
1708
1709 DecomposingNormalizerBorrowed {
1710 decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
1711 tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1712 supplementary_tables: None,
1713 decomposition_passthrough_bound: 0xC0,
1714 composition_passthrough_bound: 0x0300,
1715 }
1716 }
1717
1718 /// NFKD constructor using compiled data.
1719 ///
1720 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1721 ///
1722 /// [📚 Help choosing a constructor](icu_provider::constructors)
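    ///
    /// # Example
    ///
    /// A minimal sketch (U+FB01 is the "fi" ligature, which has a
    /// compatibility decomposition):
    ///
    /// ```
    /// let nfkd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfkd();
    /// assert_eq!(nfkd.normalize("\u{FB01}"), "fi");
    /// ```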
1723 #[cfg(feature = "compiled_data")]
1724 pub const fn new_nfkd() -> Self {
1725 const _: () = assert!(
1726 crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1727 .scalars16
1728 .const_len()
1729 + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1730 .scalars24
1731 .const_len()
1732 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1733 .scalars16
1734 .const_len()
1735 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1736 .scalars24
1737 .const_len()
1738 <= 0xFFF,
1739 "future extension"
1740 );
1741
1742 const _: () = assert!(
1743 crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap <= 0x0300,
1744 "invalid"
1745 );
1746
1747 let decomposition_capped =
1748 if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0xC0 {
1749 crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1750 } else {
1751 0xC0
1752 };
1753 let composition_capped =
1754 if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0x0300 {
1755 crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1756 } else {
1757 0x0300
1758 };
1759
1760 DecomposingNormalizerBorrowed {
1761 decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1,
1762 tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1763 supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1764 decomposition_passthrough_bound: decomposition_capped as u8,
1765 composition_passthrough_bound: composition_capped,
1766 }
1767 }
1768
1769 #[cfg(feature = "compiled_data")]
1770 pub(crate) const fn new_uts46_decomposed() -> Self {
1771 const _: () = assert!(
1772 crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1773 .scalars16
1774 .const_len()
1775 + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1776 .scalars24
1777 .const_len()
1778 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1779 .scalars16
1780 .const_len()
1781 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1782 .scalars24
1783 .const_len()
1784 <= 0xFFF,
1785 "future extension"
1786 );
1787
1788 const _: () = assert!(
1789 crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap <= 0x0300,
1790 "invalid"
1791 );
1792
1793 let decomposition_capped =
1794 if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0xC0 {
1795 crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1796 } else {
1797 0xC0
1798 };
1799 let composition_capped = if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1
1800 .passthrough_cap
1801 < 0x0300
1802 {
1803 crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1804 } else {
1805 0x0300
1806 };
1807
1808 DecomposingNormalizerBorrowed {
1809 decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1,
1810 tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1811 supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1812 decomposition_passthrough_bound: decomposition_capped as u8,
1813 composition_passthrough_bound: composition_capped,
1814 }
1815 }
1816}
1817
1818impl<'data> DecomposingNormalizerBorrowed<'data> {
1819 /// NFD constructor using already-loaded data.
1820 ///
1821 /// This constructor is intended for use by collations.
1822 ///
1823 /// [📚 Help choosing a constructor](icu_provider::constructors)
1824 #[doc(hidden)]
1825 pub fn new_with_data(
1826 decompositions: &'data DecompositionData<'data>,
1827 tables: &'data DecompositionTables<'data>,
1828 ) -> Self {
1829 Self {
1830 decompositions,
1831 tables,
1832 supplementary_tables: None,
1833 decomposition_passthrough_bound: 0xC0,
1834 composition_passthrough_bound: 0x0300,
1835 }
1836 }
1837
1838 /// Wraps a delegate iterator into a decomposing iterator
1839 /// adapter by using the data already held by this normalizer.
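    ///
    /// # Example
    ///
    /// A minimal sketch (assumes the `compiled_data` feature):
    ///
    /// ```
    /// let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
    /// let decomposed: String = nfd.normalize_iter("é".chars()).collect();
    /// assert_eq!(decomposed, "e\u{0301}");
    /// ```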
1840 pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<'data, I> {
1841 Decomposition::new_with_supplements(
1842 iter,
1843 self.decompositions,
1844 self.tables,
1845 self.supplementary_tables,
1846 self.decomposition_passthrough_bound,
1847 IgnorableBehavior::Unsupported,
1848 )
1849 }
1850
1851 normalizer_methods!();
1852
1853 decomposing_normalize_to!(
1854 /// Normalize a string slice into a `Write` sink.
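        ///
        /// # Example
        ///
        /// A minimal sketch (assumes the `compiled_data` feature; `String`
        /// implements `core::fmt::Write`):
        ///
        /// ```
        /// let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
        /// let mut out = String::new();
        /// let _ = nfd.normalize_to("é", &mut out);
        /// assert_eq!(out, "e\u{0301}");
        /// ```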
1855 ,
1856 normalize_to,
1857 core::fmt::Write,
1858 &str,
1859 {
1860 },
1861 as_str,
1862 {
1863 let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
1864 0xC3u8
1865 } else {
1866 decomposition_passthrough_bound.min(0x80) as u8
1867 };
1868 // The attribute belongs on an inner statement, but Rust doesn't allow it there.
1869 #[expect(clippy::unwrap_used)]
1870 'fast: loop {
1871 let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
1872 'fastest: loop {
1873 if let Some(&upcoming_byte) = code_unit_iter.next() {
1874 if upcoming_byte < decomposition_passthrough_byte_bound {
1875 // Fast-track succeeded!
1876 continue 'fastest;
1877 }
1878 // This deliberately isn't panic-free, since the code pattern
1879 // that was OK for the composing counterpart regressed
1880 // English and French performance if done here, too.
1881 decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
1882 break 'fastest;
1883 }
1884 // End of stream
1885 sink.write_str(pending_slice)?;
1886 return Ok(());
1887 }
1888
1889 // `unwrap()` OK, because the slice is valid UTF-8 and we know there
1890 // is an upcoming byte.
1891 let upcoming = decomposition.delegate.next().unwrap();
1892 let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
1893 if upcoming_with_trie_value.starter_and_decomposes_to_self() {
1894 continue 'fast;
1895 }
1896 let consumed_so_far_slice = &pending_slice[..pending_slice.len()
1897 - decomposition.delegate.as_str().len()
1898 - upcoming.len_utf8()];
1899 sink.write_str(consumed_so_far_slice)?;
1900
1901 // Now let's figure out if we got a starter or a non-starter.
1902 if decomposition_starts_with_non_starter(
1903 upcoming_with_trie_value.trie_val,
1904 ) {
                    // Let this trie value be reprocessed in case it is
                    // one of the rare decomposing ones.
1907 decomposition.pending = Some(upcoming_with_trie_value);
1908 decomposition.gather_and_sort_combining(0);
1909 continue 'outer;
1910 }
1911 undecomposed_starter = upcoming_with_trie_value;
1912 debug_assert!(decomposition.pending.is_none());
1913 break 'fast;
1914 }
1915 },
1916 text,
1917 sink,
1918 decomposition,
1919 decomposition_passthrough_bound,
1920 undecomposed_starter,
1921 pending_slice,
1922 'outer,
1923 );
1924
1925 decomposing_normalize_to!(
1926 /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
1927 ///
1928 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1929 /// according to the WHATWG Encoding Standard.
1930 ///
1931 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1932 #[cfg(feature = "utf8_iter")]
1933 ,
1934 normalize_utf8_to,
1935 core::fmt::Write,
1936 &[u8],
1937 {
1938 },
1939 as_slice,
1940 {
1941 let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8;
1942 'fast: loop {
1943 let mut code_unit_iter = decomposition.delegate.as_slice().iter();
1944 'fastest: loop {
1945 if let Some(&upcoming_byte) = code_unit_iter.next() {
1946 if upcoming_byte < decomposition_passthrough_byte_bound {
1947 // Fast-track succeeded!
1948 continue 'fastest;
1949 }
1950 break 'fastest;
1951 }
1952 // End of stream
1953 sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
1954 return Ok(());
1955 }
1956 #[expect(clippy::indexing_slicing)]
1957 {decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();}
1958
1959 // `unwrap()` OK, because the slice is valid UTF-8 and we know there
1960 // is an upcoming byte.
1961 #[expect(clippy::unwrap_used)]
1962 let upcoming = decomposition.delegate.next().unwrap();
1963 let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
1964 if upcoming_with_trie_value.starter_and_decomposes_to_self_except_replacement() {
1965 // Note: The trie value of the REPLACEMENT CHARACTER is
1966 // intentionally formatted to fail the
1967 // `starter_and_decomposes_to_self` test even though it
1968 // really is a starter that decomposes to self. This
                    // allows moving the branch on REPLACEMENT CHARACTER
1970 // below this `continue`.
1971 continue 'fast;
1972 }
1973
1974 // TODO: Annotate as unlikely.
1975 if upcoming == REPLACEMENT_CHARACTER {
1976 // We might have an error, so fall out of the fast path.
1977
1978 // Since the U+FFFD might signify an error, we can't
1979 // assume `upcoming.len_utf8()` for the backoff length.
1980 #[expect(clippy::indexing_slicing)]
1981 let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
1982 let back = consumed_so_far.next_back();
1983 debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
1984 let consumed_so_far_slice = consumed_so_far.as_slice();
1985 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;
1986
1987 // We could call `gather_and_sort_combining` here and
1988 // `continue 'outer`, but this should be better for code
1989 // size.
1990 undecomposed_starter = upcoming_with_trie_value;
1991 debug_assert!(decomposition.pending.is_none());
1992 break 'fast;
1993 }
1994
1995 #[expect(clippy::indexing_slicing)]
1996 let consumed_so_far_slice = &pending_slice[..pending_slice.len()
1997 - decomposition.delegate.as_slice().len()
1998 - upcoming.len_utf8()];
1999 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;
2000
2001 // Now let's figure out if we got a starter or a non-starter.
2002 if decomposition_starts_with_non_starter(
2003 upcoming_with_trie_value.trie_val,
2004 ) {
                    // Let this trie value be reprocessed in case it is
                    // one of the rare decomposing ones.
2007 decomposition.pending = Some(upcoming_with_trie_value);
2008 decomposition.gather_and_sort_combining(0);
2009 continue 'outer;
2010 }
2011 undecomposed_starter = upcoming_with_trie_value;
2012 debug_assert!(decomposition.pending.is_none());
2013 break 'fast;
2014 }
2015 },
2016 text,
2017 sink,
2018 decomposition,
2019 decomposition_passthrough_bound,
2020 undecomposed_starter,
2021 pending_slice,
2022 'outer,
2023 );
2024
2025 decomposing_normalize_to!(
2026 /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
2027 ///
2028 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
2029 /// before normalizing.
2030 ///
2031 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
2032 #[cfg(feature = "utf16_iter")]
2033 ,
2034 normalize_utf16_to,
2035 write16::Write16,
2036 &[u16],
2037 {
2038 sink.size_hint(text.len())?;
2039 },
2040 as_slice,
2041 {
2042 // This loop is only broken out of as goto forward and only as release-build recovery from
2043 // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
2044 #[expect(clippy::never_loop)]
2045 'fastwrap: loop {
2046 // Commented out `code_unit_iter` and used `ptr` and `end` to
2047 // work around https://github.com/rust-lang/rust/issues/144684 .
2048 //
2049 // let mut code_unit_iter = decomposition.delegate.as_slice().iter();
2050 let delegate_as_slice = decomposition.delegate.as_slice();
2051 let mut ptr: *const u16 = delegate_as_slice.as_ptr();
2052 // SAFETY: materializing a pointer immediately past the end of an
2053 // allocation is OK.
2054 let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
2055 'fast: loop {
2056 // if let Some(&upcoming_code_unit) = code_unit_iter.next() {
2057 if ptr != end {
2058 // SAFETY: We just checked that `ptr` has not reached `end`.
2059 // `ptr` always advances by one, and we always have a check
2060 // per advancement.
2061 let upcoming_code_unit = unsafe { *ptr };
2062 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2063 // by one points to the same allocation or to immediately
2064 // after, which is OK.
2065 ptr = unsafe { ptr.add(1) };
2066
2067 let mut upcoming32 = u32::from(upcoming_code_unit);
                        // The performance of what is logically supposed to be this
                        // branch is _incredibly_ brittle: what LLVM ends up doing
                        // around this decision can double or halve the throughput
                        // for Basic Latin in ways that are completely unintuitive.
                        // Basically _any_ change to _any_ code that participates
                        // in how LLVM sees the code around here can make the perf
                        // fall over. It seems that manually annotating this branch
                        // as likely has worse effects on non-Basic-Latin input than
                        // the case where LLVM just happens to do the right thing.
2078 //
2079 // What happens with this branch may depend on what sink type
2080 // this code is monomorphized over.
2081 //
2082 // What a terrible sink of developer time!
2083 if upcoming32 < decomposition_passthrough_bound {
2084 continue 'fast;
2085 }
2086 // We might be doing a trie lookup by surrogate. Surrogates get
2087 // a decomposition to U+FFFD.
2088 let mut trie_value = decomposition.trie.get16(upcoming_code_unit);
2089 if starter_and_decomposes_to_self_impl(trie_value) {
2090 continue 'fast;
2091 }
2092 // We might now be looking at a surrogate.
2093 // The loop is only broken out of as goto forward
2094 #[expect(clippy::never_loop)]
2095 'surrogateloop: loop {
2096 // LLVM's optimizations are incredibly brittle for the code _above_,
2097 // and using `likely` _below_ without using it _above_ helps!
2098 // What a massive sink of developer time!
2099 // Seriously, the effect of these annotations is massively
2100 // unintuitive. Measure everything!
2101 // Notably, the `if likely(...)` formulation optimizes differently
2102 // than just putting `cold_path()` on the `else` path!
2103 let surrogate_base = upcoming32.wrapping_sub(0xD800);
2104 if likely(surrogate_base > (0xDFFF - 0xD800)) {
2105 // Not surrogate
2106 break 'surrogateloop;
2107 }
2108 if likely(surrogate_base <= (0xDBFF - 0xD800)) {
2109 // let iter_backup = code_unit_iter.clone();
2110 // if let Some(&low) = code_unit_iter.next() {
2111 if ptr != end {
2112 // SAFETY: We just checked that `ptr` has not reached `end`.
2113 // `ptr` always advances by one, and we always have a check
2114 // per advancement.
2115 let low = unsafe { *ptr };
2116 if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
2117 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2118 // by one points to the same allocation or to immediately
2119 // after, which is OK.
2120 ptr = unsafe { ptr.add(1) };
2121
2122 upcoming32 = (upcoming32 << 10) + u32::from(low)
2123 - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
2124 // Successfully-paired surrogate. Read from the trie again.
2125 trie_value = {
2126 // Semantically, this bit of conditional compilation makes no sense.
2127 // The purpose is to keep LLVM seeing the untyped trie case the way
2128 // it did before so as not to regress the performance of the untyped
2129 // case due to unintuitive optimizer effects. If you care about the
2130 // perf of the untyped trie case and have better ideas, please try
2131 // something better.
2132 #[cfg(not(icu4x_unstable_fast_trie_only))]
2133 {decomposition.trie.get32(upcoming32)}
2134 #[cfg(icu4x_unstable_fast_trie_only)]
2135 {decomposition.trie.get32_supplementary(upcoming32)}
2136 };
2137 if likely(starter_and_decomposes_to_self_impl(trie_value)) {
2138 continue 'fast;
2139 }
2140 break 'surrogateloop;
2141 // } else {
2142 // code_unit_iter = iter_backup;
2143 }
2144 }
2145 }
2146 // unpaired surrogate
2147 upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
2148 // trie_value already holds a decomposition to U+FFFD.
2149 break 'surrogateloop;
2150 }
2151
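                        // SAFETY: `upcoming32` can no longer be a surrogate:
                        // unpaired surrogates were replaced with U+FFFD above.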
2152 let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                        let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);

2156 let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
2157 // code_unit_iter.as_slice().len()
2158 // SAFETY: `ptr` and `end` have been derived from the same allocation
2159 // and `ptr` is never greater than `end`.
2160 unsafe { end.offset_from(ptr) as usize }
2161 - upcoming.len_utf16()) else {
2162 // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2163 debug_assert!(false);
2164 // Throw away the results of the fast path.
2165 break 'fastwrap;
2166 };
2167 sink.write_slice(consumed_so_far_slice)?;
2168
2169 if decomposition_starts_with_non_starter(
2170 upcoming_with_trie_value.trie_val,
2171 ) {
2172 // Sync with main iterator
2173 // decomposition.delegate = code_unit_iter.as_slice().chars();
2174 // SAFETY: `ptr` and `end` have been derived from the same allocation
2175 // and `ptr` is never greater than `end`.
2176 decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
                            // Let this trie value be reprocessed in case it is
                            // one of the rare decomposing ones.
2179 decomposition.pending = Some(upcoming_with_trie_value);
2180 decomposition.gather_and_sort_combining(0);
2181 continue 'outer;
2182 }
2183 undecomposed_starter = upcoming_with_trie_value;
2184 debug_assert!(decomposition.pending.is_none());
2185 break 'fast;
2186 }
2187 // End of stream
2188 sink.write_slice(pending_slice)?;
2189 return Ok(());
2190 }
2191 // Sync the main iterator
2192 // decomposition.delegate = code_unit_iter.as_slice().chars();
2193 // SAFETY: `ptr` and `end` have been derived from the same allocation
2194 // and `ptr` is never greater than `end`.
2195 decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
2196 break 'fastwrap;
2197 }
2198 },
2199 text,
2200 sink,
2201 decomposition,
2202 decomposition_passthrough_bound,
2203 undecomposed_starter,
2204 pending_slice,
2205 'outer,
2206 );
2207}
2208
2209/// A normalizer for performing decomposing normalization.
2210#[derive(Debug)]
2211pub struct DecomposingNormalizer {
2212 decompositions: DataPayload<NormalizerNfdDataV1>,
2213 tables: DataPayload<NormalizerNfdTablesV1>,
2214 supplementary_tables: Option<DataPayload<NormalizerNfkdTablesV1>>,
2215 decomposition_passthrough_bound: u8, // never above 0xC0
2216 composition_passthrough_bound: u16, // never above 0x0300
2217}
2218
2219impl DecomposingNormalizer {
2220 /// Constructs a borrowed version of this type for more efficient querying.
2221 pub fn as_borrowed(&self) -> DecomposingNormalizerBorrowed<'_> {
2222 DecomposingNormalizerBorrowed {
2223 decompositions: self.decompositions.get(),
2224 tables: self.tables.get(),
2225 supplementary_tables: self.supplementary_tables.as_ref().map(|s| s.get()),
2226 decomposition_passthrough_bound: self.decomposition_passthrough_bound,
2227 composition_passthrough_bound: self.composition_passthrough_bound,
2228 }
2229 }
2230
2231 /// NFD constructor using compiled data.
2232 ///
2233 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2234 ///
2235 /// [📚 Help choosing a constructor](icu_provider::constructors)
2236 #[cfg(feature = "compiled_data")]
2237 pub const fn new_nfd() -> DecomposingNormalizerBorrowed<'static> {
2238 DecomposingNormalizerBorrowed::new_nfd()
2239 }
2240
2241 icu_provider::gen_buffer_data_constructors!(
2242 () -> error: DataError,
2243 functions: [
2244 new_nfd: skip,
2245 try_new_nfd_with_buffer_provider,
2246 try_new_nfd_unstable,
2247 Self,
2248 ]
2249 );
2250
2251 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
2252 pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, DataError>
2253 where
2254 D: DataProvider<NormalizerNfdDataV1> + DataProvider<NormalizerNfdTablesV1> + ?Sized,
2255 {
2256 let decompositions: DataPayload<NormalizerNfdDataV1> =
2257 provider.load(Default::default())?.payload;
2258 let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2259
2260 if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
2261 // The data is from a future where there exists a normalization flavor whose
2262 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2263 // of space. If a good use case from such a decomposition flavor arises, we can
2264 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2265 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2266 // since for now the masks are hard-coded, error out.
2267 return Err(
2268 DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2269 );
2270 }
2271
2272 let cap = decompositions.get().passthrough_cap;
2273 if cap > 0x0300 {
2274 return Err(DataError::custom("invalid").with_marker(NormalizerNfdDataV1::INFO));
2275 }
2276 let decomposition_capped = cap.min(0xC0);
2277 let composition_capped = cap.min(0x0300);
2278
2279 Ok(DecomposingNormalizer {
2280 decompositions,
2281 tables,
2282 supplementary_tables: None,
2283 decomposition_passthrough_bound: decomposition_capped as u8,
2284 composition_passthrough_bound: composition_capped,
2285 })
2286 }
2287
2288 icu_provider::gen_buffer_data_constructors!(
2289 () -> error: DataError,
2290 functions: [
2291 new_nfkd: skip,
2292 try_new_nfkd_with_buffer_provider,
2293 try_new_nfkd_unstable,
2294 Self,
2295 ]
2296 );
2297
2298 /// NFKD constructor using compiled data.
2299 ///
2300 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2301 ///
2302 /// [📚 Help choosing a constructor](icu_provider::constructors)
2303 #[cfg(feature = "compiled_data")]
2304 pub const fn new_nfkd() -> DecomposingNormalizerBorrowed<'static> {
2305 DecomposingNormalizerBorrowed::new_nfkd()
2306 }
2307
2308 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
2309 pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, DataError>
2310 where
2311 D: DataProvider<NormalizerNfkdDataV1>
2312 + DataProvider<NormalizerNfdTablesV1>
2313 + DataProvider<NormalizerNfkdTablesV1>
2314 + ?Sized,
2315 {
2316 let decompositions: DataPayload<NormalizerNfkdDataV1> =
2317 provider.load(Default::default())?.payload;
2318 let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2319 let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2320 provider.load(Default::default())?.payload;
2321
2322 if tables.get().scalars16.len()
2323 + tables.get().scalars24.len()
2324 + supplementary_tables.get().scalars16.len()
2325 + supplementary_tables.get().scalars24.len()
2326 > 0xFFF
2327 {
2328 // The data is from a future where there exists a normalization flavor whose
2329 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2330 // of space. If a good use case from such a decomposition flavor arises, we can
2331 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2332 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2333 // since for now the masks are hard-coded, error out.
2334 return Err(
2335 DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2336 );
2337 }
2338
2339 let cap = decompositions.get().passthrough_cap;
2340 if cap > 0x0300 {
2341 return Err(DataError::custom("invalid").with_marker(NormalizerNfkdDataV1::INFO));
2342 }
2343 let decomposition_capped = cap.min(0xC0);
2344 let composition_capped = cap.min(0x0300);
2345
2346 Ok(DecomposingNormalizer {
2347 decompositions: decompositions.cast(),
2348 tables,
2349 supplementary_tables: Some(supplementary_tables),
2350 decomposition_passthrough_bound: decomposition_capped as u8,
2351 composition_passthrough_bound: composition_capped,
2352 })
2353 }
2354
2355 /// UTS 46 decomposed constructor (testing only)
2356 ///
    /// This is a special building block normalization for IDNA. It is the decomposed counterpart of
    /// ICU4C's UTS 46 normalization, with two exceptions: characters that UTS 46 disallows and that
    /// ICU4C maps to U+FFFD, and characters that UTS 46 maps to the empty string, both normalize as
    /// in NFD in this normalization. In both cases, the UTS 46 processing that takes place before
    /// normalization is expected to deal with these characters. Making the disallowed characters
    /// behave like this is beneficial to data size, and this normalizer implementation cannot
    /// deal with a character normalizing to the empty string, which doesn't happen in NFD or
    /// NFKD as of Unicode 14.
2365 ///
2366 /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2367 /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
    /// U+0345 from a reorderable character into a non-reorderable character before reordering happens.
2369 /// Therefore, the output of this normalization may differ for different inputs that are
2370 /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
2371 /// to other reorderable characters.
2372 pub(crate) fn try_new_uts46_decomposed_unstable<D>(provider: &D) -> Result<Self, DataError>
2373 where
2374 D: DataProvider<NormalizerUts46DataV1>
2375 + DataProvider<NormalizerNfdTablesV1>
2376 + DataProvider<NormalizerNfkdTablesV1>
2377 // UTS 46 tables merged into CompatibilityDecompositionTablesV1
2378 + ?Sized,
2379 {
2380 let decompositions: DataPayload<NormalizerUts46DataV1> =
2381 provider.load(Default::default())?.payload;
2382 let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2383 let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2384 provider.load(Default::default())?.payload;
2385
2386 if tables.get().scalars16.len()
2387 + tables.get().scalars24.len()
2388 + supplementary_tables.get().scalars16.len()
2389 + supplementary_tables.get().scalars24.len()
2390 > 0xFFF
2391 {
2392 // The data is from a future where there exists a normalization flavor whose
2393 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2394 // of space. If a good use case from such a decomposition flavor arises, we can
2395 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2396 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2397 // since for now the masks are hard-coded, error out.
2398 return Err(
2399 DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2400 );
2401 }
2402
2403 let cap = decompositions.get().passthrough_cap;
2404 if cap > 0x0300 {
2405 return Err(DataError::custom("invalid").with_marker(NormalizerUts46DataV1::INFO));
2406 }
2407 let decomposition_capped = cap.min(0xC0);
2408 let composition_capped = cap.min(0x0300);
2409
2410 Ok(DecomposingNormalizer {
2411 decompositions: decompositions.cast(),
2412 tables,
2413 supplementary_tables: Some(supplementary_tables),
2414 decomposition_passthrough_bound: decomposition_capped as u8,
2415 composition_passthrough_bound: composition_capped,
2416 })
2417 }
2418}
2419
2420/// Borrowed version of a normalizer for performing composing normalization.
2421#[derive(Debug)]
2422pub struct ComposingNormalizerBorrowed<'a> {
2423 decomposing_normalizer: DecomposingNormalizerBorrowed<'a>,
2424 canonical_compositions: &'a CanonicalCompositions<'a>,
2425}
2426
2427impl ComposingNormalizerBorrowed<'static> {
2428 /// Cheaply converts a [`ComposingNormalizerBorrowed<'static>`] into a [`ComposingNormalizer`].
2429 ///
2430 /// Note: Due to branching and indirection, using [`ComposingNormalizer`] might inhibit some
2431 /// compile-time optimizations that are possible with [`ComposingNormalizerBorrowed`].
2432 pub const fn static_to_owned(self) -> ComposingNormalizer {
2433 ComposingNormalizer {
2434 decomposing_normalizer: self.decomposing_normalizer.static_to_owned(),
2435 canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions),
2436 }
2437 }
2438
2439 /// NFC constructor using compiled data.
2440 ///
2441 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2442 ///
2443 /// [📚 Help choosing a constructor](icu_provider::constructors)
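    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```
    /// let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
    /// assert_eq!(nfc.normalize("e\u{0301}"), "é");
    /// ```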
2444 #[cfg(feature = "compiled_data")]
2445 pub const fn new_nfc() -> Self {
2446 ComposingNormalizerBorrowed {
2447 decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfd(),
2448 canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2449 }
2450 }
2451
2452 /// NFKC constructor using compiled data.
2453 ///
2454 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2455 ///
2456 /// [📚 Help choosing a constructor](icu_provider::constructors)
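    ///
    /// # Example
    ///
    /// A minimal sketch (U+FB01 is the "fi" ligature, which has a
    /// compatibility decomposition):
    ///
    /// ```
    /// let nfkc = icu_normalizer::ComposingNormalizerBorrowed::new_nfkc();
    /// assert_eq!(nfkc.normalize("\u{FB01}"), "fi");
    /// ```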
2457 #[cfg(feature = "compiled_data")]
2458 pub const fn new_nfkc() -> Self {
2459 ComposingNormalizerBorrowed {
2460 decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfkd(),
2461 canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2462 }
2463 }
2464
2465 /// This is a special building block normalization for IDNA that implements parts of the Map
2466 /// step and the following Normalize step.
2467 ///
2468 /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2469 /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
    /// U+0345 from a reorderable character into a non-reorderable character before reordering happens.
2471 /// Therefore, the output of this normalization may differ for different inputs that are
    /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
2473 /// to other reorderable characters.
2474 #[cfg(feature = "compiled_data")]
2475 pub(crate) const fn new_uts46() -> Self {
2476 ComposingNormalizerBorrowed {
2477 decomposing_normalizer: DecomposingNormalizerBorrowed::new_uts46_decomposed(),
2478 canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2479 }
2480 }
2481}
2482
2483impl<'data> ComposingNormalizerBorrowed<'data> {
2484 /// Wraps a delegate iterator into a composing iterator
2485 /// adapter by using the data already held by this normalizer.
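    ///
    /// # Example
    ///
    /// A minimal sketch (assumes the `compiled_data` feature):
    ///
    /// ```
    /// let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
    /// let composed: String = nfc.normalize_iter("e\u{0301}".chars()).collect();
    /// assert_eq!(composed, "é");
    /// ```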
2486 pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<'data, I> {
2487 self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
2488 }
2489
2490 fn normalize_iter_private<I: Iterator<Item = char>>(
2491 &self,
2492 iter: I,
2493 ignorable_behavior: IgnorableBehavior,
2494 ) -> Composition<'data, I> {
2495 Composition::new(
2496 Decomposition::new_with_supplements(
2497 iter,
2498 self.decomposing_normalizer.decompositions,
2499 self.decomposing_normalizer.tables,
2500 self.decomposing_normalizer.supplementary_tables,
2501 self.decomposing_normalizer.decomposition_passthrough_bound,
2502 ignorable_behavior,
2503 ),
2504 self.canonical_compositions.canonical_compositions.clone(),
2505 self.decomposing_normalizer.composition_passthrough_bound,
2506 )
2507 }
2508
2509 normalizer_methods!();
2510
2511 composing_normalize_to!(
2512 /// Normalize a string slice into a `Write` sink.
2513 ,
2514 normalize_to,
2515 core::fmt::Write,
2516 &str,
2517 {},
2518 true,
2519 as_str,
2520 {
2521 // Let's hope LICM hoists this outside `'outer`.
2522 let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
2523 0xCCu8
2524 } else {
            // We can make this fancy if a normalization other than NFC, where looking
            // at non-ASCII lead bytes is worthwhile, is ever introduced.
2527 composition_passthrough_bound.min(0x80) as u8
2528 };
2529 // Attributes have to be on blocks, so hoisting all the way here.
2530 #[expect(clippy::unwrap_used)]
2531 'fast: loop {
2532 let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
2533 'fastest: loop {
2534 if let Some(&upcoming_byte) = code_unit_iter.next() {
2535 if upcoming_byte < composition_passthrough_byte_bound {
2536 // Fast-track succeeded!
2537 continue 'fastest;
2538 }
2539 let Some(remaining_slice) = pending_slice.get(pending_slice.len() - code_unit_iter.as_slice().len() - 1..) else {
2540 // If we ever come here, it's an internal bug. Let's avoid panic code paths in release builds.
2541 debug_assert!(false);
2542 // Throw away the fastest-path result in case of an internal bug.
2543 break 'fastest;
2544 };
2545 composition.decomposition.delegate = remaining_slice.chars();
2546 break 'fastest;
2547 }
2548 // End of stream
2549 sink.write_str(pending_slice)?;
2550 return Ok(());
2551 }
2552 // `unwrap()` OK, because the slice is valid UTF-8 and we know there
2553 // is an upcoming byte.
2554 let upcoming = composition.decomposition.delegate.next().unwrap();
2555 let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2556 if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2557 // Can't combine backwards, hence a plain (non-backwards-combining)
2558 // starter albeit past `composition_passthrough_bound`
2559
2560 // Fast-track succeeded!
2561 continue 'fast;
2562 }
2563 // We need to fall off the fast path.
2564 composition.decomposition.pending = Some(upcoming_with_trie_value);
2565
2566 // slicing and unwrap OK, because we've just evidently read enough previously.
2567 let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
                // `unwrap` OK, because we've previously managed to read the previous character.
2569 undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2570 let consumed_so_far_slice = consumed_so_far.as_str();
2571 sink.write_str(consumed_so_far_slice)?;
2572 break 'fast;
2573 }
2574 },
2575 text,
2576 sink,
2577 composition,
2578 composition_passthrough_bound,
2579 undecomposed_starter,
2580 pending_slice,
2581 len_utf8,
2582 );
2583
2584 composing_normalize_to!(
2585 /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
2586 ///
2587 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
2588 /// according to the WHATWG Encoding Standard.
2589 ///
2590 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
2591 #[cfg(feature = "utf8_iter")]
2592 ,
2593 normalize_utf8_to,
2594 core::fmt::Write,
2595 &[u8],
2596 {},
2597 false,
2598 as_slice,
2599 {
2600 'fast: loop {
2601 if let Some(upcoming) = composition.decomposition.delegate.next() {
2602 if u32::from(upcoming) < composition_passthrough_bound {
2603 // Fast-track succeeded!
2604 continue 'fast;
2605 }
2606 // TODO: Be statically aware of fast/small trie.
2607 let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2608 if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2609 // Note: The trie value of the REPLACEMENT CHARACTER is
2610 // intentionally formatted to fail the
2611 // `potential_passthrough_and_cannot_combine_backwards`
2612 // test even though it really is a starter that decomposes
2613 // to self and cannot combine backwards. This
2614 // Allows moving the branch on REPLACEMENT CHARACTER
2615 // below this `continue`.
2616 continue 'fast;
2617 }
2618 // We need to fall off the fast path.
2619
2620 // TODO(#2006): Annotate as unlikely
2621 if upcoming == REPLACEMENT_CHARACTER {
2622 // Can't tell if this is an error or a literal U+FFFD in
2623 // the input. Assuming the former to be sure.
2624
2625 // Since the U+FFFD might signify an error, we can't
2626 // assume `upcoming.len_utf8()` for the backoff length.
2627 #[expect(clippy::indexing_slicing)]
2628 let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
2629 let back = consumed_so_far.next_back();
2630 debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
2631 let consumed_so_far_slice = consumed_so_far.as_slice();
2632 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) })?;
2633 undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
2634 composition.decomposition.pending = None;
2635 break 'fast;
2636 }
2637
2638 composition.decomposition.pending = Some(upcoming_with_trie_value);
2639 // slicing and unwrap OK, because we've just evidently read enough previously.
                    // `unwrap` OK, because we've previously managed to read the previous character.
2641 #[expect(clippy::indexing_slicing)]
2642 let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
2643 #[expect(clippy::unwrap_used)]
2644 {
2645 // TODO: If the previous character was below the passthrough bound,
2646 // we really need to read from the trie. Otherwise, we could maintain
2647 // the most-recent trie value. Need to measure what's more expensive:
2648 // Remembering the trie value on each iteration or re-reading the
2649 // last one after the fast-track run.
2650 undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2651 }
2652 let consumed_so_far_slice = consumed_so_far.as_slice();
2653 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice)})?;
2654 break 'fast;
2655 }
2656 // End of stream
2657 sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
2658 return Ok(());
2659 }
2660 },
2661 text,
2662 sink,
2663 composition,
2664 composition_passthrough_bound,
2665 undecomposed_starter,
2666 pending_slice,
2667 len_utf8,
2668 );
2669
2670 composing_normalize_to!(
2671 /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
2672 ///
2673 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
2674 /// before normalizing.
2675 ///
2676 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
2677 #[cfg(feature = "utf16_iter")]
2678 ,
2679 normalize_utf16_to,
2680 write16::Write16,
2681 &[u16],
2682 {
2683 sink.size_hint(text.len())?;
2684 },
2685 false,
2686 as_slice,
2687 {
2688 // This loop is only broken out of as goto forward and only as release-build recovery from
2689 // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
2690 #[expect(clippy::never_loop)]
2691 'fastwrap: loop {
2692 // Commented out `code_unit_iter` and used `ptr` and `end` to
2693 // work around https://github.com/rust-lang/rust/issues/144684 .
2694 //
2695 // let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
2696 let delegate_as_slice = composition.decomposition.delegate.as_slice();
2697 let mut ptr: *const u16 = delegate_as_slice.as_ptr();
2698 // SAFETY: materializing a pointer immediately past the end of an
2699 // allocation is OK.
2700 let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
2701
2702 'fast: loop {
2703 // if let Some(&upcoming_code_unit) = code_unit_iter.next() {
2704 if ptr != end {
2705 // SAFETY: We just checked that `ptr` has not reached `end`.
2706 // `ptr` always advances by one, and we always have a check
2707 // per advancement.
2708 let upcoming_code_unit = unsafe { *ptr };
2709 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2710 // by one points to the same allocation or to immediately
2711 // after, which is OK.
2712 ptr = unsafe { ptr.add(1) };
2713
2714 let mut upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
                        // The performance of what is logically supposed to be this
                        // branch is somewhat brittle: what LLVM ends up doing
                        // around this decision can double or halve the throughput
                        // for Basic Latin in ways that are completely unintuitive.
                        // Basically _any_ change to _any_ code that participates
                        // in how LLVM sees the code around here can make the perf
                        // fall over. It seems that manually annotating this branch
                        // as likely has worse effects on non-Basic-Latin input than
                        // the case where LLVM just happens to do the right thing.
2725 //
2726 // What happens with this branch may depend on what sink type
2727 // this code is monomorphized over.
2728 //
2729 // What a terrible sink of developer time!
2730 if upcoming32 < composition_passthrough_bound {
2731 // No need for surrogate or U+FFFD check, because
2732 // `composition_passthrough_bound` cannot be higher than
2733 // U+0300.
2734 // Fast-track succeeded!
2735 continue 'fast;
2736 }
2737 // We might be doing a trie lookup by surrogate. Surrogates get
2738 // a decomposition to U+FFFD.
2739 let mut trie_value = composition.decomposition.trie.get16(upcoming_code_unit);
2740 if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) {
2741 // Can't combine backwards, hence a plain (non-backwards-combining)
2742 // starter albeit past `composition_passthrough_bound`
2743
2744 // Fast-track succeeded!
2745 continue 'fast;
2746 }
2747
2748 // We might now be looking at a surrogate.
2749 // The loop is only broken out of as goto forward
2750 #[expect(clippy::never_loop)]
2751 'surrogateloop: loop {
2752 // The `likely` annotations _below_ exist to make the code _above_
2753 // go faster!
2754 let surrogate_base = upcoming32.wrapping_sub(0xD800);
2755 if likely(surrogate_base > (0xDFFF - 0xD800)) {
2756 // Not surrogate
2757 break 'surrogateloop;
2758 }
2759 if likely(surrogate_base <= (0xDBFF - 0xD800)) {
2760 // let iter_backup = code_unit_iter.clone();
2761 // if let Some(&low) = code_unit_iter.next() {
2762 if ptr != end {
2763 // SAFETY: We just checked that `ptr` has not reached `end`.
2764 // `ptr` always advances by one, and we always have a check
2765 // per advancement.
2766 let low = unsafe { *ptr };
2767 if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
2768 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2769 // by one points to the same allocation or to immediately
2770 // after, which is OK.
2771 ptr = unsafe { ptr.add(1) };
2772
2773 upcoming32 = (upcoming32 << 10) + u32::from(low)
2774 - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
2775 // Successfully-paired surrogate. Read from the trie again.
2776 trie_value = {
2777 // Semantically, this bit of conditional compilation makes no sense.
2778 // The purpose is to keep LLVM seeing the untyped trie case the way
2779 // it did before so as not to regress the performance of the untyped
2780 // case due to unintuitive optimizer effects. If you care about the
2781 // perf of the untyped trie case and have better ideas, please try
2782 // something better.
2783 #[cfg(not(icu4x_unstable_fast_trie_only))]
2784 {composition.decomposition.trie.get32(upcoming32)}
2785 #[cfg(icu4x_unstable_fast_trie_only)]
2786 {composition.decomposition.trie.get32_supplementary(upcoming32)}
2787 };
2788 if likely(potential_passthrough_and_cannot_combine_backwards_impl(trie_value)) {
2789 // Fast-track succeeded!
2790 continue 'fast;
2791 }
2792 break 'surrogateloop;
2793 // } else {
2794 // code_unit_iter = iter_backup;
2795 }
2796 }
2797 }
2798 // unpaired surrogate
2799 upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
2800 // trie_value already holds a decomposition to U+FFFD.
2801 debug_assert_eq!(trie_value, NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER | 0xFFFD);
2802 break 'surrogateloop;
2803 }
2804
2805 // SAFETY: upcoming32 can no longer be a surrogate.
2806 let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
2807 let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
2808 // We need to fall off the fast path.
2809 composition.decomposition.pending = Some(upcoming_with_trie_value);
2810 let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
2811 // code_unit_iter.as_slice().len()
2812 // SAFETY: `ptr` and `end` have been derived from the same allocation
2813 // and `ptr` is never greater than `end`.
2814 unsafe { end.offset_from(ptr) as usize }
2815 - upcoming.len_utf16()) else {
2816 // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2817 debug_assert!(false);
2818 // Throw away the results of the fast path.
2819 break 'fastwrap;
2820 };
2821 let mut consumed_so_far = consumed_so_far_slice.chars();
2822 let Some(c_from_back) = consumed_so_far.next_back() else {
2823 // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2824 debug_assert!(false);
2825 // Throw away the results of the fast path.
2826 break 'fastwrap;
2827 };
2828 // TODO: If the previous character was below the passthrough bound,
2829 // we really need to read from the trie. Otherwise, we could maintain
2830 // the most-recent trie value. Need to measure what's more expensive:
2831 // Remembering the trie value on each iteration or re-reading the
2832 // last one after the fast-track run.
2833 undecomposed_starter = composition.decomposition.attach_trie_value(c_from_back);
2834 sink.write_slice(consumed_so_far.as_slice())?;
2835 break 'fast;
2836 }
2837 // End of stream
2838 sink.write_slice(pending_slice)?;
2839 return Ok(());
2840 }
2841 // Sync the main iterator
2842 // composition.decomposition.delegate = code_unit_iter.as_slice().chars();
                // SAFETY: `ptr` and `end` have been derived from the same allocation
2844 // and `ptr` is never greater than `end`.
2845 composition.decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
2846 break 'fastwrap;
2847 }
2848 },
2849 text,
2850 sink,
2851 composition,
2852 composition_passthrough_bound,
2853 undecomposed_starter,
2854 pending_slice,
2855 len_utf16,
2856 );
2857}
2858
2859/// A normalizer for performing composing normalization.
2860#[derive(Debug)]
2861pub struct ComposingNormalizer {
2862 decomposing_normalizer: DecomposingNormalizer,
2863 canonical_compositions: DataPayload<NormalizerNfcV1>,
2864}
2865
2866impl ComposingNormalizer {
2867 /// Constructs a borrowed version of this type for more efficient querying.
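    ///
    /// # Example
    ///
    /// A minimal sketch (assumes the `compiled_data` feature):
    ///
    /// ```
    /// let owned: icu_normalizer::ComposingNormalizer =
    ///     icu_normalizer::ComposingNormalizerBorrowed::new_nfc().static_to_owned();
    /// assert!(owned.as_borrowed().is_normalized("é"));
    /// ```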
2868 pub fn as_borrowed(&self) -> ComposingNormalizerBorrowed<'_> {
2869 ComposingNormalizerBorrowed {
2870 decomposing_normalizer: self.decomposing_normalizer.as_borrowed(),
2871 canonical_compositions: self.canonical_compositions.get(),
2872 }
2873 }
2874
2875 /// NFC constructor using compiled data.
2876 ///
2877 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2878 ///
2879 /// [📚 Help choosing a constructor](icu_provider::constructors)
2880 #[cfg(feature = "compiled_data")]
2881 pub const fn new_nfc() -> ComposingNormalizerBorrowed<'static> {
2882 ComposingNormalizerBorrowed::new_nfc()
2883 }
2884
2885 icu_provider::gen_buffer_data_constructors!(
2886 () -> error: DataError,
2887 functions: [
2888 new_nfc: skip,
2889 try_new_nfc_with_buffer_provider,
2890 try_new_nfc_unstable,
2891 Self,
2892 ]
2893 );
2894
2895 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
2896 pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, DataError>
2897 where
2898 D: DataProvider<NormalizerNfdDataV1>
2899 + DataProvider<NormalizerNfdTablesV1>
2900 + DataProvider<NormalizerNfcV1>
2901 + ?Sized,
2902 {
2903 let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?;
2904
2905 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2906 provider.load(Default::default())?.payload;
2907
2908 Ok(ComposingNormalizer {
2909 decomposing_normalizer,
2910 canonical_compositions,
2911 })
2912 }
2913
2914 /// NFKC constructor using compiled data.
2915 ///
2916 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2917 ///
2918 /// [📚 Help choosing a constructor](icu_provider::constructors)
2919 #[cfg(feature = "compiled_data")]
2920 pub const fn new_nfkc() -> ComposingNormalizerBorrowed<'static> {
2921 ComposingNormalizerBorrowed::new_nfkc()
2922 }
2923
2924 icu_provider::gen_buffer_data_constructors!(
2925 () -> error: DataError,
2926 functions: [
2927 new_nfkc: skip,
2928 try_new_nfkc_with_buffer_provider,
2929 try_new_nfkc_unstable,
2930 Self,
2931 ]
2932 );
2933
2934 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
2935 pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, DataError>
2936 where
2937 D: DataProvider<NormalizerNfkdDataV1>
2938 + DataProvider<NormalizerNfdTablesV1>
2939 + DataProvider<NormalizerNfkdTablesV1>
2940 + DataProvider<NormalizerNfcV1>
2941 + ?Sized,
2942 {
2943 let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?;
2944
2945 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2946 provider.load(Default::default())?.payload;
2947
2948 Ok(ComposingNormalizer {
2949 decomposing_normalizer,
2950 canonical_compositions,
2951 })
2952 }
2953
2954 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
2955 pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, DataError>
2956 where
2957 D: DataProvider<NormalizerUts46DataV1>
2958 + DataProvider<NormalizerNfdTablesV1>
2959 + DataProvider<NormalizerNfkdTablesV1>
2960 // UTS 46 tables merged into CompatibilityDecompositionTablesV1
2961 + DataProvider<NormalizerNfcV1>
2962 + ?Sized,
2963 {
2964 let decomposing_normalizer =
2965 DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
2966
2967 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2968 provider.load(Default::default())?.payload;
2969
2970 Ok(ComposingNormalizer {
2971 decomposing_normalizer,
2972 canonical_compositions,
2973 })
2974 }
2975}
2976
2977#[cfg(feature = "utf16_iter")]
2978struct IsNormalizedSinkUtf16<'a> {
2979 expect: &'a [u16],
2980}
2981
2982#[cfg(feature = "utf16_iter")]
2983impl<'a> IsNormalizedSinkUtf16<'a> {
2984 pub fn new(slice: &'a [u16]) -> Self {
2985 IsNormalizedSinkUtf16 { expect: slice }
2986 }
2987 pub fn remaining_len(&self) -> usize {
2988 self.expect.len()
2989 }
2990}
2991
2992#[cfg(feature = "utf16_iter")]
2993impl write16::Write16 for IsNormalizedSinkUtf16<'_> {
2994 fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
2995 // We know that if we get a slice, it's a pass-through,
2996 // so we can compare addresses. Indexing is OK, because
2997 // an indexing failure would be a code bug rather than
2998 // an input or data issue.
2999 #[expect(clippy::indexing_slicing)]
3000 if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
3001 self.expect = &self.expect[s.len()..];
3002 Ok(())
3003 } else {
3004 Err(core::fmt::Error {})
3005 }
3006 }
3007
3008 fn write_char(&mut self, c: char) -> core::fmt::Result {
3009 let mut iter = self.expect.chars();
3010 if iter.next() == Some(c) {
3011 self.expect = iter.as_slice();
3012 Ok(())
3013 } else {
3014 Err(core::fmt::Error {})
3015 }
3016 }
3017}
3018
3019#[cfg(feature = "utf8_iter")]
3020struct IsNormalizedSinkUtf8<'a> {
3021 expect: &'a [u8],
3022}
3023
3024#[cfg(feature = "utf8_iter")]
3025impl<'a> IsNormalizedSinkUtf8<'a> {
3026 pub fn new(slice: &'a [u8]) -> Self {
3027 IsNormalizedSinkUtf8 { expect: slice }
3028 }
3029 pub fn remaining_len(&self) -> usize {
3030 self.expect.len()
3031 }
3032}
3033
3034#[cfg(feature = "utf8_iter")]
3035impl core::fmt::Write for IsNormalizedSinkUtf8<'_> {
3036 fn write_str(&mut self, s: &str) -> core::fmt::Result {
3037 // We know that if we get a slice, it's a pass-through,
3038 // so we can compare addresses. Indexing is OK, because
3039 // an indexing failure would be a code bug rather than
3040 // an input or data issue.
3041 #[expect(clippy::indexing_slicing)]
3042 if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
3043 self.expect = &self.expect[s.len()..];
3044 Ok(())
3045 } else {
3046 Err(core::fmt::Error {})
3047 }
3048 }
3049
3050 fn write_char(&mut self, c: char) -> core::fmt::Result {
3051 let mut iter = self.expect.chars();
3052 if iter.next() == Some(c) {
3053 self.expect = iter.as_slice();
3054 Ok(())
3055 } else {
3056 Err(core::fmt::Error {})
3057 }
3058 }
3059}
3060
3061struct IsNormalizedSinkStr<'a> {
3062 expect: &'a str,
3063}
3064
3065impl<'a> IsNormalizedSinkStr<'a> {
3066 pub fn new(slice: &'a str) -> Self {
3067 IsNormalizedSinkStr { expect: slice }
3068 }
3069 pub fn remaining_len(&self) -> usize {
3070 self.expect.len()
3071 }
3072}
3073
3074impl core::fmt::Write for IsNormalizedSinkStr<'_> {
3075 fn write_str(&mut self, s: &str) -> core::fmt::Result {
3076 // We know that if we get a slice, it's a pass-through,
3077 // so we can compare addresses. Indexing is OK, because
3078 // an indexing failure would be a code bug rather than
3079 // an input or data issue.
        #[expect(clippy::indexing_slicing)]
        if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
3081 self.expect = &self.expect[s.len()..];
3082 Ok(())
3083 } else {
3084 Err(core::fmt::Error {})
3085 }
3086 }
3087
3088 fn write_char(&mut self, c: char) -> core::fmt::Result {
3089 let mut iter = self.expect.chars();
3090 if iter.next() == Some(c) {
3091 self.expect = iter.as_str();
3092 Ok(())
3093 } else {
3094 Err(core::fmt::Error {})
3095 }
3096 }
3097}