icu_normalizer/lib.rs
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
    not(test),
    deny(
        clippy::indexing_slicing,
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::panic,
        clippy::exhaustive_structs,
        clippy::exhaustive_enums,
        clippy::trivially_copy_pass_by_ref,
        missing_debug_implementations,
    )
)]
#![warn(missing_docs)]

//! Normalizing text into Unicode Normalization Forms.
//!
//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
//!
//! # Functionality
//!
//! The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode
//! Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD.
//!
//! Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8,
//! and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator.
//!
//! The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA
//! Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by
//! applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the
//! [`idna`](https://docs.rs/idna/latest/idna/) crate.
//!
//! The `properties` module provides the non-recursive canonical decomposition operation on a per-`char` basis and
//! the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class
//! property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the
//! [`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate.
//!
//! Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in
//! addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive
//! non-“maybe” answer.
//!
//! # Examples
//!
//! ```
//! let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
//! assert_eq!(nfc.normalize("a\u{0308}"), "ä");
//! assert!(nfc.is_normalized("ä"));
//!
//! let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
//! assert_eq!(nfd.normalize("ä"), "a\u{0308}");
//! assert!(!nfd.is_normalized("ä"));
//! ```
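//!
//! A normalizing iterator can wrap any iterator over `char`. A minimal
//! sketch (using the `normalize_iter` method defined later in this file):
//!
//! ```
//! let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
//! let normalized: String = nfc.normalize_iter("a\u{0308}".chars()).collect();
//! assert_eq!(normalized, "ä");
//! ```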

extern crate alloc;

// TODO: The plan is to replace
// `#[cfg(not(icu4x_unstable_fast_trie_only))]`
// with
// `#[cfg(feature = "serde")]`
// and
// `#[cfg(icu4x_unstable_fast_trie_only)]`
// with
// `#[cfg(not(feature = "serde"))]`
//
// Before doing so:
// * The type of the UTS 46 trie needs to be
//   disentangled from the type of the NFD/NFKD tries.
//   This will involve a more generic iterator hidden
//   inside the public iterator types.
// * datagen needs to emit fast-mode tries for the
//   NFD and NFKD tries.
// * The markers and possibly the data struct type
//   for NFD and NFKD need to be revised per policy.

#[cfg(not(icu4x_unstable_fast_trie_only))]
type Trie<'trie> = CodePointTrie<'trie, u32>;

#[cfg(icu4x_unstable_fast_trie_only)]
type Trie<'trie> = FastCodePointTrie<'trie, u32>;

// We don't depend on icu_properties to minimize deps, but we want to be able
// to ensure we're using the right CCC values.
macro_rules! ccc {
    ($name:ident, $num:expr) => {
        const {
            #[cfg(feature = "icu_properties")]
            if icu_properties::props::CanonicalCombiningClass::$name.to_icu4c_value() != $num {
                panic!("icu_normalizer has incorrect ccc values")
            }
            CanonicalCombiningClass::from_icu4c_value($num)
        }
    };
}
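
// Usage sketch: `ccc!(Above, 230)` evaluates to Canonical Combining Class
// 230 and, when the `icu_properties` feature is enabled, additionally
// verifies at compile time that 230 is the value icu_properties assigns to
// `CanonicalCombiningClass::Above`. See `CCC_ABOVE` below.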

pub mod properties;
pub mod provider;
pub mod uts46;

use crate::provider::CanonicalCompositions;
use crate::provider::DecompositionData;
use crate::provider::NormalizerNfdDataV1;
use crate::provider::NormalizerNfkdDataV1;
use crate::provider::NormalizerUts46DataV1;
use alloc::borrow::Cow;
use alloc::string::String;
use core::char::REPLACEMENT_CHARACTER;
use icu_collections::char16trie::Char16Trie;
use icu_collections::char16trie::Char16TrieIterator;
use icu_collections::char16trie::TrieResult;
#[cfg(not(icu4x_unstable_fast_trie_only))]
use icu_collections::codepointtrie::CodePointTrie;
#[cfg(icu4x_unstable_fast_trie_only)]
use icu_collections::codepointtrie::FastCodePointTrie;
#[cfg(icu4x_unstable_fast_trie_only)]
use icu_collections::codepointtrie::TypedCodePointTrie;
#[cfg(feature = "icu_properties")]
use icu_properties::props::CanonicalCombiningClass;
use icu_provider::prelude::*;
use provider::DecompositionTables;
use provider::NormalizerNfcV1;
use provider::NormalizerNfdTablesV1;
use provider::NormalizerNfkdTablesV1;
use smallvec::SmallVec;
#[cfg(feature = "utf16_iter")]
use utf16_iter::Utf16CharsEx;
#[cfg(feature = "utf8_iter")]
use utf8_iter::Utf8CharsEx;
use zerovec::{zeroslice, ZeroSlice};

// The optimizations in the area where `likely` is used
// are extremely brittle. `likely` is useful in the typed-trie
// case on the UTF-16 fast path, but in order not to disturb
// the untyped-trie case on the UTF-16 fast path, make the
// annotations no-ops in the untyped-trie case.

// `cold_path` and `likely` come from
// https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .
// See https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3#commitcomment-164768806
// for permission to relicense under Unicode-3.0.

#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
#[cold]
fn cold_path() {}

#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
#[inline(always)]
pub(crate) fn likely(b: bool) -> bool {
    if b {
        true
    } else {
        cold_path();
        false
    }
}

// End import from https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .

/// No-op for the untyped-trie case.
#[cfg(all(not(icu4x_unstable_fast_trie_only), feature = "utf16_iter"))]
#[inline(always)]
fn likely(b: bool) -> bool {
    b
}

/// This type exists as a shim for the icu_properties `CanonicalCombiningClass`
/// when that crate is disabled. It should not be exposed to users.
#[cfg(not(feature = "icu_properties"))]
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
struct CanonicalCombiningClass(pub(crate) u8);

#[cfg(not(feature = "icu_properties"))]
impl CanonicalCombiningClass {
    const fn from_icu4c_value(v: u8) -> Self {
        Self(v)
    }
    const fn to_icu4c_value(self) -> u8 {
        self.0
    }
}

const CCC_NOT_REORDERED: CanonicalCombiningClass = ccc!(NotReordered, 0);
const CCC_ABOVE: CanonicalCombiningClass = ccc!(Above, 230);

/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
#[derive(Debug, PartialEq, Eq)]
enum IgnorableBehavior {
    /// 0xFFFFFFFF in data is not supported.
    Unsupported,
    /// Ignorables are ignored.
    Ignored,
    /// Ignorables are treated as singleton decompositions
    /// to the REPLACEMENT CHARACTER.
    ReplacementCharacter,
}

/// Marker for UTS 46 ignorables.
///
/// See trie-value-format.md
const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;

/// Marker that the decomposition does not round trip via NFC.
///
/// See trie-value-format.md
const NON_ROUND_TRIP_MARKER: u32 = 1 << 30;

/// Marker that the first character of the decomposition
/// can combine backwards.
///
/// See trie-value-format.md
const BACKWARD_COMBINING_MARKER: u32 = 1 << 31;

/// Mask for the bits that have to be zero for this to be a BMP
/// singleton decomposition, or a value baked into the surrogate
/// range.
///
/// See trie-value-format.md
const HIGH_ZEROS_MASK: u32 = 0x3FFF0000;

/// Mask for the bits that have to be zero for this to be a complex
/// decomposition.
///
/// See trie-value-format.md
const LOW_ZEROS_MASK: u32 = 0xFFE0;
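
// Sketch of the trie value bit layout implied by the constants above
// (trie-value-format.md is authoritative):
//
//   bit 31:        BACKWARD_COMBINING_MARKER
//   bit 30:        NON_ROUND_TRIP_MARKER
//   bits 29..=16:  must be zero for BMP singleton decompositions and
//                  for values baked into the surrogate range (HIGH_ZEROS_MASK)
//   bits 15..=5:   must be zero for complex decompositions (LOW_ZEROS_MASK)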

/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
///
/// See trie-value-format.md
fn trie_value_has_ccc(trie_value: u32) -> bool {
    (trie_value & 0x3FFFFE00) == 0xD800
}

/// Checks if the trie value signifies a special non-starter decomposition.
///
/// See trie-value-format.md
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
    (trie_value & 0x3FFFFF00) == 0xD900
}

/// Checks if a trie value signifies a character whose decomposition
/// starts with a non-starter.
///
/// See trie-value-format.md
fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
    trie_value_has_ccc(trie_value)
}

/// Extracts a canonical combining class (possibly zero) from a trie value.
///
/// See trie-value-format.md
fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
    if trie_value_has_ccc(trie_value) {
        CanonicalCombiningClass::from_icu4c_value(trie_value as u8)
    } else {
        CCC_NOT_REORDERED
    }
}
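
// Worked example (illustrative, per trie-value-format.md): a non-starter
// that decomposes to itself, such as U+0301, has a trie value of the form
// 0xD800 | ccc (possibly with marker bits 30 and 31 set; the masks above
// ignore them). For ccc 230 that's 0xD8E6: `trie_value_has_ccc(0xD8E6)`
// holds because 0xD8E6 & 0x3FFFFE00 == 0xD800, and
// `ccc_from_trie_value(0xD8E6)` truncates to the low byte, 0xE6 == 230.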

/// The tail (everything after the first character) of the NFKD form of U+FDFA
/// as 16-bit units.
static FDFA_NFKD: [u16; 17] = [
    0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
    0x633, 0x644, 0x645,
];
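
// Together with the leading U+0635 returned from `decomposing_next` below,
// this spells the 18-character NFKD of U+FDFA ARABIC LIGATURE SALLALLAHOU
// ALAYHE WASALLAM.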

/// Marker value for U+FDFA in NFKD. (Unified with the Hangul syllable marker,
/// but they differ by `NON_ROUND_TRIP_MARKER`.)
///
/// See trie-value-format.md
const FDFA_MARKER: u16 = 1;

// These constants originate from page 143 of Unicode 14.0.
/// Syllable base
const HANGUL_S_BASE: u32 = 0xAC00;
/// Lead jamo base
const HANGUL_L_BASE: u32 = 0x1100;
/// Vowel jamo base
const HANGUL_V_BASE: u32 = 0x1161;
/// Trail jamo base (deliberately off by one to account for the absence of a trail)
const HANGUL_T_BASE: u32 = 0x11A7;
/// Lead jamo count
const HANGUL_L_COUNT: u32 = 19;
/// Vowel jamo count
const HANGUL_V_COUNT: u32 = 21;
/// Trail jamo count (deliberately off by one to account for the absence of a trail)
const HANGUL_T_COUNT: u32 = 28;
/// Vowel jamo count times trail jamo count
const HANGUL_N_COUNT: u32 = 588;
/// Syllable count
const HANGUL_S_COUNT: u32 = 11172;

/// One past the conjoining jamo block
const HANGUL_JAMO_LIMIT: u32 = 0x1200;
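
// Worked example of the arithmetic these constants support (see `compose`
// and `decomposing_next` below): the Hangul syllable U+D4DB has
// SIndex = 0xD4DB - HANGUL_S_BASE = 10459, which decomposes as
// L = 10459 / 588 = 17, V = (10459 % 588) / 28 = 16, T = 10459 % 28 = 15,
// i.e. <U+1111, U+1171, U+11B6>. Composition runs the same math in reverse:
// 17 * 588 + 16 * 28 + 15 + 0xAC00 == 0xD4DB.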

/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
/// are enabled and return `default` if debug assertions are not enabled.
///
/// Use this only if the only reason why `opt` could be `None` is bogus
/// data from the provider.
#[inline(always)]
fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
    if let Some(val) = opt {
        val
    } else {
        // GIGO case
        debug_assert!(false);
        default
    }
}

/// Convert a `u32` _obtained from data provider data_ to `char`.
#[inline(always)]
fn char_from_u32(u: u32) -> char {
    unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER)
}

/// Convert a `u16` _obtained from data provider data_ to `char`.
#[inline(always)]
fn char_from_u16(u: u16) -> char {
    char_from_u32(u32::from(u))
}

const EMPTY_U16: &ZeroSlice<u16> = zeroslice![];

const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![];

#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
    u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
}

#[inline(always)]
#[cfg(feature = "utf16_iter")]
fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
    u.wrapping_sub(start) <= (end - start)
}
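
// The two range checks above use the standard single-comparison trick:
// for unsigned `x`, `x.wrapping_sub(start) <= end - start` is equivalent
// to `start <= x && x <= end`, because values below `start` wrap around
// to very large numbers. E.g. `in_inclusive_range(c, '\u{11A8}', '\u{11C2}')`
// below compiles to one subtraction and one comparison.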

/// Performs canonical composition (including Hangul) on a pair of
/// characters or returns `None` if these characters don't compose.
/// Composition exclusions are taken into account.
#[inline]
fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    let v = u32::from(second).wrapping_sub(HANGUL_V_BASE);
    if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
        return compose_non_hangul(iter, starter, second);
    }
    if v < HANGUL_V_COUNT {
        let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
        if l < HANGUL_L_COUNT {
            let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
            // Safe, because the inputs are known to be in range.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
        }
        return None;
    }
    if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') {
        let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
        if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
            let lvt = lv + (u32::from(second) - HANGUL_T_BASE);
            // Safe, because the inputs are known to be in range.
            return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
        }
    }
    None
}

/// Performs (non-Hangul) canonical composition on a pair of characters
/// or returns `None` if these characters don't compose. Composition
/// exclusions are taken into account.
fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
    // To make the trie smaller, the pairs are stored second character first.
    // Given how this method is used, it's known at each call site whether
    // `second` is or isn't a starter, so we could potentially split the trie
    // into two tries depending on whether `second` is a starter.
    match iter.next(second) {
        TrieResult::NoMatch => None,
        TrieResult::NoValue => match iter.next(starter) {
            TrieResult::NoMatch => None,
            TrieResult::FinalValue(i) => {
                if let Some(c) = char::from_u32(i as u32) {
                    Some(c)
                } else {
                    // GIGO case
                    debug_assert!(false);
                    None
                }
            }
            TrieResult::NoValue | TrieResult::Intermediate(_) => {
                // GIGO case
                debug_assert!(false);
                None
            }
        },
        TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => {
            // GIGO case
            debug_assert!(false);
            None
        }
    }
}

/// See trie-value-format.md
#[inline(always)]
fn starter_and_decomposes_to_self_impl(trie_val: u32) -> bool {
    // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
    // and this function needs to ignore that.
    (trie_val & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0
}

/// See trie-value-format.md
#[inline(always)]
fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> bool {
    (trie_val & (NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER)) == 0
}

/// Struct for holding together a character and the value
/// looked up for it from the NFD trie in a more explicit
/// way than an anonymous pair.
/// Also holds a flag about the supplementary-trie provenance.
#[derive(Debug, PartialEq, Eq)]
struct CharacterAndTrieValue {
    character: char,
    /// See trie-value-format.md
    trie_val: u32,
}

impl CharacterAndTrieValue {
    #[inline(always)]
    pub fn new(c: char, trie_value: u32) -> Self {
        CharacterAndTrieValue {
            character: c,
            trie_val: trie_value,
        }
    }

    #[inline(always)]
    pub fn starter_and_decomposes_to_self(&self) -> bool {
        starter_and_decomposes_to_self_impl(self.trie_val)
    }

    /// See trie-value-format.md
    #[inline(always)]
    #[cfg(feature = "utf8_iter")]
    pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool {
        // This intentionally leaves `NON_ROUND_TRIP_MARKER` in the value
        // to be compared with zero. U+FFFD has that flag set despite
        // actually round-tripping in order to make UTF-8 errors
        // ineligible for passthrough.
        (self.trie_val & !BACKWARD_COMBINING_MARKER) == 0
    }

    /// See trie-value-format.md
    #[inline(always)]
    pub fn can_combine_backwards(&self) -> bool {
        (self.trie_val & BACKWARD_COMBINING_MARKER) != 0
    }
    /// See trie-value-format.md
    #[inline(always)]
    pub fn potential_passthrough(&self) -> bool {
        (self.trie_val & NON_ROUND_TRIP_MARKER) == 0
    }
    /// See trie-value-format.md
    #[inline(always)]
    pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
        potential_passthrough_and_cannot_combine_backwards_impl(self.trie_val)
    }
}

/// Pack a `char` and a `CanonicalCombiningClass` in
/// 32 bits (the former in the lower 24 bits and the
/// latter in the high 8 bits). The latter can be
/// initialized to 0xFF upon creation, in which case
/// it can actually be set later by calling
/// `set_ccc_from_trie_if_not_already_set`. This is
/// a micro optimization to avoid the Canonical
/// Combining Class trie lookup when there is only
/// one combining character in a sequence. This type
/// is intentionally non-`Copy` to get compiler help
/// in making sure that the class is set on the
/// instance on which it is intended to be set
/// and not on a temporary copy.
///
/// Note that 0xFF won't be assigned to an actual
/// canonical combining class per definition D104
/// in The Unicode Standard.
//
// NOTE: The Pernosco debugger has special knowledge
// of this struct. Please do not change the bit layout
// or the crate-module-qualified name of this struct
// without coordination.
#[derive(Debug)]
struct CharacterAndClass(u32);
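
// Layout sketch (illustrative): `CharacterAndClass::new('\u{0300}', CCC_ABOVE)`
// stores 0xE6000300, i.e. U+0300 in bits 0..=23 and class 230 (0xE6) in bits
// 24..=31, while `new_with_placeholder('\u{0300}')` stores 0xFF000300 until
// the real class is looked up from the trie.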

impl CharacterAndClass {
    pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
        CharacterAndClass(u32::from(c) | (u32::from(ccc.to_icu4c_value()) << 24))
    }
    pub fn new_with_placeholder(c: char) -> Self {
        CharacterAndClass(u32::from(c) | ((0xFF) << 24))
    }
    pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
        Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
    }
    pub fn new_starter(c: char) -> Self {
        CharacterAndClass(u32::from(c))
    }
    /// This method must exist for Pernosco to apply its special rendering.
    /// Also, this must not be dead code!
    pub fn character(&self) -> char {
        // Safe, because the low 24 bits came from a `char`
        // originally.
        unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
    }
    /// This method must exist for Pernosco to apply its special rendering.
    pub fn ccc(&self) -> CanonicalCombiningClass {
        CanonicalCombiningClass::from_icu4c_value((self.0 >> 24) as u8)
    }

    pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
        (self.character(), self.ccc())
    }
    pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &Trie) {
        if self.0 >> 24 != 0xFF {
            return;
        }
        let scalar = self.0 & 0xFFFFFF;
        self.0 =
            ((ccc_from_trie_value(trie.get32_u32(scalar)).to_icu4c_value() as u32) << 24) | scalar;
    }
}

// This function exists as a borrow check helper.
#[inline(always)]
fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &Trie) {
    // We don't look up the canonical combining class for starters
    // or for single combining characters between starters. When
    // there's more than one combining character between starters,
    // we look up the canonical combining class for each character
    // exactly once.
    if slice.len() < 2 {
        return;
    }
    slice
        .iter_mut()
        .for_each(|cc| cc.set_ccc_from_trie_if_not_already_set(trie));
    slice.sort_by_key(|cc| cc.ccc());
}
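
// Canonical Ordering in action (illustrative): given the buffer
// <U+0301 (ccc 230), U+0323 (ccc 220)>, the stable sort above reorders it
// to <U+0323, U+0301> because 220 < 230, while characters with equal
// classes keep their relative order.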

/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed `char` sequence.
#[derive(Debug)]
pub struct Decomposition<'data, I>
where
    I: Iterator<Item = char>,
{
    delegate: I,
    buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA
    /// The index of the next item to be read from `buffer`.
    /// The purpose of this index is to avoid having to move
    /// the rest upon every read.
    buffer_pos: usize,
    // At the start of `next()` if not `None`, this is a pending unnormalized
    // starter. When `Decomposition` appears alone, this is never a non-starter.
    // However, when `Decomposition` appears inside a `Composition`, this
    // may become a non-starter before `decomposing_next()` is called.
    pending: Option<CharacterAndTrieValue>, // None at end of stream
    // See trie-value-format.md
    trie: &'data Trie<'data>,
    scalars16: &'data ZeroSlice<u16>,
    scalars24: &'data ZeroSlice<char>,
    supplementary_scalars16: &'data ZeroSlice<u16>,
    supplementary_scalars24: &'data ZeroSlice<char>,
    /// The lowest character for which either of the following does
    /// not hold:
    /// 1. Decomposes to self.
    /// 2. Decomposition starts with a non-starter.
    decomposition_passthrough_bound: u32, // never above 0xC0
    ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
}

impl<'data, I> Decomposition<'data, I>
where
    I: Iterator<Item = char>,
{
    /// Constructs a decomposing iterator adapter from a delegate
    /// iterator and references to the necessary data, without
    /// supplementary data.
    ///
    /// Use `DecomposingNormalizer::normalize_iter()` instead unless
    /// there's a good reason to use this constructor directly.
    ///
    /// Public but hidden in order to be able to use this from the
    /// collator.
    #[doc(hidden)] // used in collator
    pub fn new(
        delegate: I,
        decompositions: &'data DecompositionData,
        tables: &'data DecompositionTables,
    ) -> Self {
        Self::new_with_supplements(
            delegate,
            decompositions,
            tables,
            None,
            0xC0,
            IgnorableBehavior::Unsupported,
        )
    }

    /// Constructs a decomposing iterator adapter from a delegate
    /// iterator and references to the necessary data, including
    /// supplementary data.
    ///
    /// Use `DecomposingNormalizer::normalize_iter()` instead unless
    /// there's a good reason to use this constructor directly.
    fn new_with_supplements(
        delegate: I,
        decompositions: &'data DecompositionData,
        tables: &'data DecompositionTables,
        supplementary_tables: Option<&'data DecompositionTables>,
        decomposition_passthrough_bound: u8,
        ignorable_behavior: IgnorableBehavior,
    ) -> Self {
        let mut ret = Decomposition::<I> {
            delegate,
            buffer: SmallVec::new(), // Normalized
            buffer_pos: 0,
            // Initialize with a placeholder starter in case
            // the real stream starts with a non-starter.
            pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)),
            #[allow(clippy::useless_conversion, clippy::expect_used)] // Expectation always succeeds when untyped tries are in use
            trie: <&Trie>::try_from(&decompositions.trie).expect("Unexpected trie type in data"),
            scalars16: &tables.scalars16,
            scalars24: &tables.scalars24,
            supplementary_scalars16: if let Some(supplementary) = supplementary_tables {
                &supplementary.scalars16
            } else {
                EMPTY_U16
            },
            supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
                &supplementary.scalars24
            } else {
                EMPTY_CHAR
            },
            decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
            ignorable_behavior,
        };
        let _ = ret.next(); // Remove the U+FFFF placeholder
        ret
    }

    fn push_decomposition16(
        &mut self,
        offset: usize,
        len: usize,
        only_non_starters_in_trail: bool,
        slice16: &ZeroSlice<u16>,
    ) -> (char, usize) {
        let (starter, tail) = slice16
            .get_subslice(offset..offset + len)
            .and_then(|slice| slice.split_first())
            .map_or_else(
                || {
                    // GIGO case
                    debug_assert!(false);
                    (REPLACEMENT_CHARACTER, EMPTY_U16)
                },
                |(first, trail)| (char_from_u16(first), trail),
            );
        if only_non_starters_in_trail {
            // All the rest are combining
            self.buffer.extend(
                tail.iter()
                    .map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))),
            );
            (starter, 0)
        } else {
            let mut i = 0;
            let mut combining_start = 0;
            for u in tail.iter() {
                let ch = char_from_u16(u);
                let trie_value = self.trie.get(ch);
                self.buffer.push(CharacterAndClass::new_with_trie_value(
                    CharacterAndTrieValue::new(ch, trie_value),
                ));
                i += 1;
                // Half-width kana and iota subscript don't occur in the tails
                // of these multicharacter decompositions.
                if !decomposition_starts_with_non_starter(trie_value) {
                    combining_start = i;
                }
            }
            (starter, combining_start)
        }
    }

    fn push_decomposition32(
        &mut self,
        offset: usize,
        len: usize,
        only_non_starters_in_trail: bool,
        slice32: &ZeroSlice<char>,
    ) -> (char, usize) {
        let (starter, tail) = slice32
            .get_subslice(offset..offset + len)
            .and_then(|slice| slice.split_first())
            .unwrap_or_else(|| {
                // GIGO case
                debug_assert!(false);
                (REPLACEMENT_CHARACTER, EMPTY_CHAR)
            });
        if only_non_starters_in_trail {
            // All the rest are combining
            self.buffer
                .extend(tail.iter().map(CharacterAndClass::new_with_placeholder));
            (starter, 0)
        } else {
            let mut i = 0;
            let mut combining_start = 0;
            for ch in tail.iter() {
                let trie_value = self.trie.get(ch);
                self.buffer.push(CharacterAndClass::new_with_trie_value(
                    CharacterAndTrieValue::new(ch, trie_value),
                ));
                i += 1;
                // Half-width kana and iota subscript don't occur in the tails
                // of these multicharacter decompositions.
                if !decomposition_starts_with_non_starter(trie_value) {
                    combining_start = i;
                }
            }
            (starter, combining_start)
        }
    }

    #[inline(always)]
    fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
        CharacterAndTrieValue::new(c, self.trie.get(c))
    }

    fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
        debug_assert!(self.pending.is_none());
        loop {
            let c = self.delegate.next()?;

            // TODO(#2384): Measure if this check is actually an optimization.
            if u32::from(c) < self.decomposition_passthrough_bound {
                return Some(CharacterAndTrieValue::new(c, 0));
            }

            let trie_val = self.trie.get(c);
            // TODO: Can we do something better about the cost of this branch in the
            // non-UTS 46 case?
            if trie_val == IGNORABLE_MARKER {
                match self.ignorable_behavior {
                    IgnorableBehavior::Unsupported => {
                        debug_assert!(false);
                    }
                    IgnorableBehavior::ReplacementCharacter => {
                        return Some(CharacterAndTrieValue::new(
                            c,
                            u32::from(REPLACEMENT_CHARACTER) | NON_ROUND_TRIP_MARKER,
                        ));
                    }
                    IgnorableBehavior::Ignored => {
                        // Else ignore this character by reading the next one from the delegate.
                        continue;
                    }
                }
            }
            return Some(CharacterAndTrieValue::new(c, trie_val));
        }
    }

    fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
        if let Some(pending) = self.pending.take() {
            // Only happens as part of `Composition` and as part of
            // the contiguous-buffer methods of `DecomposingNormalizer`.
            // I.e. does not happen as part of standalone iterator
            // usage of `Decomposition`.
            Some(pending)
        } else {
            self.delegate_next_no_pending()
        }
    }

    fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
        let (starter, combining_start) = {
            let c = c_and_trie_val.character;
            // See trie-value-format.md
            let decomposition = c_and_trie_val.trie_val;
            // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
            // and that flag needs to be ignored here.
            if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
                // The character is its own decomposition
                (c, 0)
            } else {
                let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
                let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
                if !high_zeros && !low_zeros {
                    // Decomposition into two BMP characters: starter and non-starter
                    let starter = char_from_u32(decomposition & 0x7FFF);
                    let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
                    self.buffer
                        .push(CharacterAndClass::new_with_placeholder(combining));
                    (starter, 0)
                } else if high_zeros {
                    // Do the check by looking at `c` instead of looking at a marker
                    // in `singleton` below, because if we looked at the trie value,
                    // we'd still have to check that `c` is in the Hangul syllable
                    // range in order for the subsequent interpretations as `char`
                    // to be safe.
                    // Alternatively, `FDFA_MARKER` and the Hangul marker could
                    // be unified. That would add a branch for Hangul and remove
                    // a branch from singleton decompositions. It seems more
                    // important to favor Hangul syllables over singleton
                    // decompositions.
                    // Note that it would be valid to hoist this Hangul check
                    // one or even two steps earlier in this check hierarchy.
                    // Right now, it's assumed that the kind of decomposition into
                    // a BMP starter and non-starter, which occurs in many languages,
                    // should be checked before Hangul syllables, which are specific
                    // to one language. Hopefully, we get some
                    // instruction-level parallelism out of the disjointness of
                    // operations on `c` and `decomposition`.
                    let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
                    if hangul_offset < HANGUL_S_COUNT {
                        debug_assert_eq!(decomposition, 1);
                        // Hangul syllable
                        // The math here comes from page 144 of Unicode 14.0
                        let l = hangul_offset / HANGUL_N_COUNT;
                        let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
                        let t = hangul_offset % HANGUL_T_COUNT;

                        // The unsafe blocks here are OK, because the values stay
                        // within the Hangul jamo block and, therefore, the scalar
                        // value range by construction.
                        self.buffer.push(CharacterAndClass::new_starter(unsafe {
                            core::char::from_u32_unchecked(HANGUL_V_BASE + v)
                        }));
                        let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
                        if t != 0 {
                            self.buffer.push(CharacterAndClass::new_starter(unsafe {
                                core::char::from_u32_unchecked(HANGUL_T_BASE + t)
                            }));
                            (first, 2)
                        } else {
                            (first, 1)
                        }
                    } else {
                        let singleton = decomposition as u16;
                        if singleton != FDFA_MARKER {
                            // Decomposition into one BMP character
                            let starter = char_from_u16(singleton);
                            (starter, 0)
                        } else {
                            // Special case for the NFKD form of U+FDFA.
                            self.buffer.extend(FDFA_NFKD.map(|u| {
                                // SAFETY: `FDFA_NFKD` is known not to contain
                                // surrogates.
                                CharacterAndClass::new_starter(unsafe {
                                    core::char::from_u32_unchecked(u32::from(u))
                                })
                            }));
                            ('\u{0635}', 17)
                        }
                    }
                } else {
                    debug_assert!(low_zeros);
                    // Only 12 of 14 bits used as of Unicode 16.
                    let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
                    // Only 3 of 4 bits used as of Unicode 16.
                    let len_bits = decomposition & 0b1111;
                    let only_non_starters_in_trail = (decomposition & 0b10000) != 0;
                    if offset < self.scalars16.len() {
                        self.push_decomposition16(
                            offset,
                            (len_bits + 2) as usize,
                            only_non_starters_in_trail,
                            self.scalars16,
                        )
                    } else if offset < self.scalars16.len() + self.scalars24.len() {
                        self.push_decomposition32(
                            offset - self.scalars16.len(),
                            (len_bits + 1) as usize,
                            only_non_starters_in_trail,
                            self.scalars24,
                        )
                    } else if offset
                        < self.scalars16.len()
                            + self.scalars24.len()
                            + self.supplementary_scalars16.len()
                    {
                        self.push_decomposition16(
                            offset - (self.scalars16.len() + self.scalars24.len()),
                            (len_bits + 2) as usize,
                            only_non_starters_in_trail,
                            self.supplementary_scalars16,
                        )
                    } else {
                        self.push_decomposition32(
                            offset
                                - (self.scalars16.len()
                                    + self.scalars24.len()
                                    + self.supplementary_scalars16.len()),
                            (len_bits + 1) as usize,
                            only_non_starters_in_trail,
                            self.supplementary_scalars24,
                        )
                    }
                }
            }
        };
        // Either we're inside `Composition` or `self.pending.is_none()`.

        self.gather_and_sort_combining(combining_start);
        starter
    }

    fn gather_and_sort_combining(&mut self, combining_start: usize) {
        // Not a `for` loop to avoid holding a mutable reference to `self` across
        // the loop body.
        while let Some(ch_and_trie_val) = self.delegate_next() {
            if !trie_value_has_ccc(ch_and_trie_val.trie_val) {
                self.pending = Some(ch_and_trie_val);
                break;
            } else if !trie_value_indicates_special_non_starter_decomposition(
                ch_and_trie_val.trie_val,
            ) {
                self.buffer
                    .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
            } else {
                // The Tibetan special cases are starters that decompose into non-starters.
                let mapped = match ch_and_trie_val.character {
                    '\u{0340}' => {
                        // COMBINING GRAVE TONE MARK
                        CharacterAndClass::new('\u{0300}', CCC_ABOVE)
                    }
                    '\u{0341}' => {
                        // COMBINING ACUTE TONE MARK
                        CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                    }
                    '\u{0343}' => {
                        // COMBINING GREEK KORONIS
                        CharacterAndClass::new('\u{0313}', CCC_ABOVE)
                    }
                    '\u{0344}' => {
                        // COMBINING GREEK DIALYTIKA TONOS
                        self.buffer
                            .push(CharacterAndClass::new('\u{0308}', CCC_ABOVE));
                        CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                    }
                    '\u{0F73}' => {
                        // TIBETAN VOWEL SIGN II
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F72}', ccc!(CCC130, 130))
                    }
                    '\u{0F75}' => {
                        // TIBETAN VOWEL SIGN UU
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F74}', ccc!(CCC132, 132))
                    }
                    '\u{0F81}' => {
                        // TIBETAN VOWEL SIGN REVERSED II
                        self.buffer
                            .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                        CharacterAndClass::new('\u{0F80}', ccc!(CCC130, 130))
                    }
                    '\u{FF9E}' => {
                        // HALFWIDTH KATAKANA VOICED SOUND MARK
                        CharacterAndClass::new('\u{3099}', ccc!(KanaVoicing, 8))
                    }
                    '\u{FF9F}' => {
                        // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
                        CharacterAndClass::new('\u{309A}', ccc!(KanaVoicing, 8))
                    }
                    _ => {
                        // GIGO case
                        debug_assert!(false);
                        CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
                    }
                };
                self.buffer.push(mapped);
            }
        }
        // Slicing succeeds by construction; we've always ensured that `combining_start`
        // is in permissible range.
        #[expect(clippy::indexing_slicing)]
        sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
    }
}

impl<I> Iterator for Decomposition<'_, I>
where
    I: Iterator<Item = char>,
{
    type Item = char;

    fn next(&mut self) -> Option<char> {
        if let Some(ret) = self.buffer.get(self.buffer_pos).map(|c| c.character()) {
            self.buffer_pos += 1;
            if self.buffer_pos == self.buffer.len() {
                self.buffer.clear();
                self.buffer_pos = 0;
            }
            return Some(ret);
        }
        debug_assert_eq!(self.buffer_pos, 0);
        let c_and_trie_val = self.pending.take()?;
        Some(self.decomposing_next(c_and_trie_val))
    }
}

/// An iterator adaptor that turns an `Iterator` over `char` into
/// a lazily-decomposed and then canonically composed `char` sequence.
#[derive(Debug)]
pub struct Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    /// The decomposing part of the normalizer that operates before
    /// the canonical composition is performed on its output.
    decomposition: Decomposition<'data, I>,
    /// Non-Hangul canonical composition data.
    canonical_compositions: Char16Trie<'data>,
    /// To make `next()` yield in cases where there's a non-composing
    /// starter in the decomposition buffer, we put it here to let it
    /// wait for the next `next()` call (or a jump forward within the
    /// `next()` call).
    unprocessed_starter: Option<char>,
    /// The lowest character for which any one of the following does
    /// not hold:
    /// 1. Roundtrips via decomposition and recomposition.
    /// 2. Decomposition starts with a non-starter.
    /// 3. Is not a backward-combining starter.
    composition_passthrough_bound: u32,
}

impl<'data, I> Composition<'data, I>
where
    I: Iterator<Item = char>,
{
    fn new(
        decomposition: Decomposition<'data, I>,
        canonical_compositions: Char16Trie<'data>,
        composition_passthrough_bound: u16,
    ) -> Self {
        Self {
            decomposition,
            canonical_compositions,
            unprocessed_starter: None,
            composition_passthrough_bound: u32::from(composition_passthrough_bound),
        }
    }

    /// Performs canonical composition (including Hangul) on a pair of
    /// characters or returns `None` if these characters don't compose.
    /// Composition exclusions are taken into account.
    #[inline(always)]
    pub fn compose(&self, starter: char, second: char) -> Option<char> {
        compose(self.canonical_compositions.iter(), starter, second)
    }

    /// Performs (non-Hangul) canonical composition on a pair of characters
    /// or returns `None` if these characters don't compose. Composition
    /// exclusions are taken into account.
    #[inline(always)]
    fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
        compose_non_hangul(self.canonical_compositions.iter(), starter, second)
    }
}

impl<I> Iterator for Composition<'_, I>
where
    I: Iterator<Item = char>,
{
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0); // The compiler can't figure out that this gets overwritten before use.
        if self.unprocessed_starter.is_none() {
            // The loop is only broken out of as goto forward
            #[expect(clippy::never_loop)]
            loop {
                if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    self.decomposition.buffer_pos += 1;
                    if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
                        self.decomposition.buffer.clear();
                        self.decomposition.buffer_pos = 0;
                    }
                    if ccc == CCC_NOT_REORDERED {
                        // Previous decomposition contains a starter. This must
                        // now become the `unprocessed_starter` for it to have
                        // a chance to compose with the upcoming characters.
                        //
                        // E.g. parenthesized Hangul in NFKC comes through here,
                        // but a suitable composition exclusion could exercise this
                        // in NFC.
                        self.unprocessed_starter = Some(character);
                        break; // We already have a starter, so skip taking one from `pending`.
                    }
                    return Some(character);
                }
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                undecomposed_starter = self.decomposition.pending.take()?;
                if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
                    || undecomposed_starter.potential_passthrough()
                {
                    // TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming
                    // character is not below `decomposition_passthrough_bound` but is
                    // below `composition_passthrough_bound`, we read from the trie
                    // unnecessarily.
                    if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
                        let cannot_combine_backwards = u32::from(upcoming.character)
                            < self.composition_passthrough_bound
                            || !upcoming.can_combine_backwards();
                        self.decomposition.pending = Some(upcoming);
                        if cannot_combine_backwards {
                            // Fast-track succeeded!
                            return Some(undecomposed_starter.character);
                        }
                    } else {
                        // End of stream
                        return Some(undecomposed_starter.character);
                    }
                }
                break; // Not actually looping
            }
        }
        let mut starter = '\u{0}'; // The compiler can't figure out this gets overwritten before use.

        // The point of having this boolean is to have only one call site to
        // `self.decomposition.decomposing_next`, which is hopefully beneficial for
        // code size under inlining.
        let mut attempt_composition = false;
        loop {
            if let Some(unprocessed) = self.unprocessed_starter.take() {
                debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0));
                debug_assert_eq!(starter, '\u{0}');
                starter = unprocessed;
            } else {
                debug_assert_eq!(self.decomposition.buffer_pos, 0);
                let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
                if !attempt_composition {
                    starter = next_starter;
                } else if let Some(composed) = self.compose(starter, next_starter) {
                    starter = composed;
                } else {
                    // This is our yield point. We'll pick this up above in the
                    // next call to `next()`.
                    self.unprocessed_starter = Some(next_starter);
                    return Some(starter);
                }
            }
            // We first loop by index to avoid moving the contents of `buffer`, but
            // if there's a discontiguous match, we'll start modifying `buffer` instead.
            loop {
                let (character, ccc) = if let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(self.decomposition.buffer_pos)
                    .map(|c| c.character_and_ccc())
                {
                    (character, ccc)
                } else {
                    self.decomposition.buffer.clear();
                    self.decomposition.buffer_pos = 0;
                    break;
                };
                if let Some(composed) = self.compose(starter, character) {
                    starter = composed;
                    self.decomposition.buffer_pos += 1;
                    continue;
                }
                let mut most_recent_skipped_ccc = ccc;
                {
                    let _ = self
                        .decomposition
                        .buffer
                        .drain(0..self.decomposition.buffer_pos);
                }
                self.decomposition.buffer_pos = 0;
                if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                    // We failed to compose a starter. Discontiguous match not allowed.
                    // We leave the starter in `buffer` for `next()` to find.
                    return Some(starter);
                }
                let mut i = 1; // We have skipped one non-starter.
                while let Some((character, ccc)) = self
                    .decomposition
                    .buffer
                    .get(i)
                    .map(|c| c.character_and_ccc())
                {
                    if ccc == CCC_NOT_REORDERED {
                        // Discontiguous match not allowed.
                        return Some(starter);
                    }
                    debug_assert!(ccc >= most_recent_skipped_ccc);
                    if ccc != most_recent_skipped_ccc {
                        // Using the non-Hangul version as a micro-optimization, since
                        // we already rejected the case where `second` is a starter
                        // above, and conjoining jamo are starters.
                        if let Some(composed) = self.compose_non_hangul(starter, character) {
                            self.decomposition.buffer.remove(i);
                            starter = composed;
                            continue;
                        }
                    }
                    most_recent_skipped_ccc = ccc;
                    i += 1;
                }
                break;
            }

            debug_assert_eq!(self.decomposition.buffer_pos, 0);

            if !self.decomposition.buffer.is_empty() {
                return Some(starter);
            }
            // Now we need to check if composition with an upcoming starter is possible.
            #[expect(clippy::unwrap_used)]
            if self.decomposition.pending.is_some() {
                // We know that `pending_starter` decomposes to start with a starter.
                // Otherwise, it would have been moved to `self.decomposition.buffer`
                // by `self.decomposing_next()`. We do this set lookup here in order
                // to get an opportunity to go back to the fast track.
                // Note that this check has to happen _after_ checking that `pending`
                // holds a character, because this flag isn't defined to be meaningful
                // when `pending` isn't holding a character.
                let pending = self.decomposition.pending.as_ref().unwrap();
                if u32::from(pending.character) < self.composition_passthrough_bound
                    || !pending.can_combine_backwards()
                {
                    // Won't combine backwards anyway.
                    return Some(starter);
                }
                // Consume what we peeked. `unwrap` OK, because we checked `is_some()`
                // above.
                undecomposed_starter = self.decomposition.pending.take().unwrap();
                // The following line is OK, because we're about to loop back
                // to `self.decomposition.decomposing_next(c);`, which will
                // restore the between-`next()`-calls invariant of `pending`
                // before this function returns.
                attempt_composition = true;
                continue;
            }
            // End of input
            return Some(starter);
        }
    }
}

macro_rules! composing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $always_valid_utf:literal,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $composition:ident,
     $composition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $len_utf:ident,
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog
            let mut $composition = self.normalize_iter($text.chars());
            debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
            for cc in $composition.decomposition.buffer.drain(..) {
                $sink.write_char(cc.character())?;
            }

            // Try to get the compiler to hoist the bound to a register.
            let $composition_passthrough_bound = $composition.composition_passthrough_bound;
            'outer: loop {
                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                let mut $undecomposed_starter =
                    if let Some(pending) = $composition.decomposition.pending.take() {
                        pending
                    } else {
                        return Ok(());
                    };
                if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
                    $undecomposed_starter.potential_passthrough()
                {
                    // We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
                    // was returned in response to an error by the iterator. Assume the
                    // latter for correctness even though it pessimizes the former.
                    if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
                        let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
                        // The `$fast` block must either:
                        // 1. Return due to reaching EOF
                        // 2. Leave a starter with its trie value in `$undecomposed_starter`
                        //    and, if there is still more input, leave the next character
                        //    and its trie value in `$composition.decomposition.pending`.
                        $fast
                    }
                }
                // Fast track above, full algorithm below
                let mut starter = $composition
                    .decomposition
                    .decomposing_next($undecomposed_starter);
                'bufferloop: loop {
                    // We first loop by index to avoid moving the contents of `buffer`, but
                    // if there's a discontiguous match, we'll start modifying `buffer` instead.
                    loop {
                        let (character, ccc) = if let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get($composition.decomposition.buffer_pos)
                            .map(|c| c.character_and_ccc())
                        {
                            (character, ccc)
                        } else {
                            $composition.decomposition.buffer.clear();
                            $composition.decomposition.buffer_pos = 0;
                            break;
                        };
                        if let Some(composed) = $composition.compose(starter, character) {
                            starter = composed;
                            $composition.decomposition.buffer_pos += 1;
                            continue;
                        }
                        let mut most_recent_skipped_ccc = ccc;
                        if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                            // We failed to compose a starter. Discontiguous match not allowed.
                            // Write the current `starter` we've been composing, make the unmatched
                            // starter in the buffer the new `starter` (we know it's been decomposed)
                            // and process the rest of the buffer with that as the starter.
                            $sink.write_char(starter)?;
                            starter = character;
                            $composition.decomposition.buffer_pos += 1;
                            continue 'bufferloop;
                        } else {
                            {
                                let _ = $composition
                                    .decomposition
                                    .buffer
                                    .drain(0..$composition.decomposition.buffer_pos);
                            }
                            $composition.decomposition.buffer_pos = 0;
                        }
                        let mut i = 1; // We have skipped one non-starter.
                        while let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get(i)
                            .map(|c| c.character_and_ccc())
                        {
                            if ccc == CCC_NOT_REORDERED {
                                // Discontiguous match not allowed.
                                $sink.write_char(starter)?;
                                for cc in $composition.decomposition.buffer.drain(..i) {
                                    $sink.write_char(cc.character())?;
                                }
                                starter = character;
                                {
                                    let removed = $composition.decomposition.buffer.remove(0);
                                    debug_assert_eq!(starter, removed.character());
                                }
                                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                                continue 'bufferloop;
                            }
                            debug_assert!(ccc >= most_recent_skipped_ccc);
                            if ccc != most_recent_skipped_ccc {
                                // Using the non-Hangul version as a micro-optimization, since
                                // we already rejected the case where `second` is a starter
                                // above, and conjoining jamo are starters.
                                if let Some(composed) =
                                    $composition.compose_non_hangul(starter, character)
                                {
                                    $composition.decomposition.buffer.remove(i);
                                    starter = composed;
                                    continue;
                                }
                            }
                            most_recent_skipped_ccc = ccc;
                            i += 1;
                        }
                        break;
                    }
                    debug_assert_eq!($composition.decomposition.buffer_pos, 0);

                    if !$composition.decomposition.buffer.is_empty() {
                        $sink.write_char(starter)?;
                        for cc in $composition.decomposition.buffer.drain(..) {
                            $sink.write_char(cc.character())?;
                        }
                        // We had a non-empty buffer, so can't compose with upcoming.
                        continue 'outer;
                    }
                    // Now we need to check if composition with an upcoming starter is possible.
                    if $composition.decomposition.pending.is_some() {
                        // We know that `pending_starter` decomposes to start with a starter.
                        // Otherwise, it would have been moved to `composition.decomposition.buffer`
                        // by `composition.decomposing_next()`. We do this set lookup here in order
                        // to get an opportunity to go back to the fast track.
                        // Note that this check has to happen _after_ checking that `pending`
                        // holds a character, because this flag isn't defined to be meaningful
                        // when `pending` isn't holding a character.
                        let pending = $composition.decomposition.pending.as_ref().unwrap();
                        if u32::from(pending.character) < $composition.composition_passthrough_bound
                            || !pending.can_combine_backwards()
                        {
                            // Won't combine backwards anyway.
                            $sink.write_char(starter)?;
                            continue 'outer;
                        }
                        let pending_starter = $composition.decomposition.pending.take().unwrap();
                        let decomposed = $composition.decomposition.decomposing_next(pending_starter);
                        if let Some(composed) = $composition.compose(starter, decomposed) {
                            starter = composed;
                        } else {
                            $sink.write_char(starter)?;
                            starter = decomposed;
                        }
                        continue 'bufferloop;
                    }
                    // End of input
                    $sink.write_char(starter)?;
                    return Ok(());
                } // 'bufferloop
            }
        }
    };
}

macro_rules! decomposing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $decomposition:ident,
     $decomposition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $outer:lifetime, // loop labels use lifetime tokens
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog

            let mut $decomposition = self.normalize_iter($text.chars());
            debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);

            // Try to get the compiler to hoist the bound to a register.
            let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
            $outer: loop {
                for cc in $decomposition.buffer.drain(..) {
                    $sink.write_char(cc.character())?;
                }
                debug_assert_eq!($decomposition.buffer_pos, 0);
                let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
                    pending
                } else {
                    return Ok(());
                };
                if $undecomposed_starter.starter_and_decomposes_to_self() {
                    // Don't bother including `undecomposed_starter` in a contiguous buffer
                    // write: Just write it right away:
                    $sink.write_char($undecomposed_starter.character)?;

                    let $pending_slice = $decomposition.delegate.$as_slice();
                    $fast
                }
                let starter = $decomposition.decomposing_next($undecomposed_starter);
                $sink.write_char(starter)?;
            }
        }
    };
}
1508
1509macro_rules! normalizer_methods {
1510 () => {
1511 /// Normalize a string slice into a `Cow<'a, str>`.
1512 pub fn normalize<'a>(&self, text: &'a str) -> Cow<'a, str> {
1513 let (head, tail) = self.split_normalized(text);
1514 if tail.is_empty() {
1515 return Cow::Borrowed(head);
1516 }
1517 let mut ret = String::new();
1518 ret.reserve(text.len());
1519 ret.push_str(head);
1520 let _ = self.normalize_to(tail, &mut ret);
1521 Cow::Owned(ret)
1522 }
1523
1524 /// Split a string slice into maximum normalized prefix and unnormalized suffix
1525 /// such that the concatenation of the prefix and the normalization of the suffix
1526 /// is the normalization of the whole input.
1527 pub fn split_normalized<'a>(&self, text: &'a str) -> (&'a str, &'a str) {
1528 let up_to = self.is_normalized_up_to(text);
1529 text.split_at_checked(up_to).unwrap_or_else(|| {
1530 // Internal bug, not even GIGO, never supposed to happen
1531 debug_assert!(false);
1532 ("", text)
1533 })
1534 }
1535
1536 /// Return the index a string slice is normalized up to.
1537 fn is_normalized_up_to(&self, text: &str) -> usize {
1538 let mut sink = IsNormalizedSinkStr::new(text);
1539 let _ = self.normalize_to(text, &mut sink);
1540 text.len() - sink.remaining_len()
1541 }
1542
1543 /// Check whether a string slice is normalized.
1544 pub fn is_normalized(&self, text: &str) -> bool {
1545 self.is_normalized_up_to(text) == text.len()
1546 }
1547
1548 /// Normalize a slice of potentially-invalid UTF-16 into a `Cow<'a, [u16]>`.
1549 ///
1550 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
1551 /// before normalizing.
1552 ///
1553 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1554 #[cfg(feature = "utf16_iter")]
1555 pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
1556 let (head, tail) = self.split_normalized_utf16(text);
1557 if tail.is_empty() {
1558 return Cow::Borrowed(head);
1559 }
1560 let mut ret = alloc::vec::Vec::with_capacity(text.len());
1561 ret.extend_from_slice(head);
1562 let _ = self.normalize_utf16_to(tail, &mut ret);
1563 Cow::Owned(ret)
1564 }
1565
1566 /// Split a slice of potentially-invalid UTF-16 into maximum normalized (and valid)
1567 /// prefix and unnormalized suffix such that the concatenation of the prefix and the
1568 /// normalization of the suffix is the normalization of the whole input.
1569 ///
1570 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1571 #[cfg(feature = "utf16_iter")]
1572 pub fn split_normalized_utf16<'a>(&self, text: &'a [u16]) -> (&'a [u16], &'a [u16]) {
1573 let up_to = self.is_normalized_utf16_up_to(text);
1574 text.split_at_checked(up_to).unwrap_or_else(|| {
1575 // Internal bug, not even GIGO, never supposed to happen
1576 debug_assert!(false);
1577 (&[], text)
1578 })
1579 }
1580
1581 /// Return the index a slice of potentially-invalid UTF-16 is normalized up to.
1582 ///
1583 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1584 #[cfg(feature = "utf16_iter")]
1585 fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
1586 let mut sink = IsNormalizedSinkUtf16::new(text);
1587 let _ = self.normalize_utf16_to(text, &mut sink);
1588 text.len() - sink.remaining_len()
1589 }
1590
        /// Check whether a slice of potentially-invalid UTF-16 is normalized.
1592 ///
1593 /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
1594 ///
1595 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1596 #[cfg(feature = "utf16_iter")]
1597 pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
1598 self.is_normalized_utf16_up_to(text) == text.len()
1599 }
1600
1601 /// Normalize a slice of potentially-invalid UTF-8 into a `Cow<'a, str>`.
1602 ///
1603 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1604 /// according to the WHATWG Encoding Standard.
1605 ///
1606 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
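        ///
        /// # Example
        ///
        /// A minimal sketch (assumes the `compiled_data` feature):
        ///
        /// ```
        /// let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
        /// // 0x61 0xCC 0x81 is "a" followed by U+0301 in UTF-8.
        /// assert_eq!(nfc.normalize_utf8(b"a\xCC\x81"), "á");
        /// // An ill-formed byte sequence becomes U+FFFD.
        /// assert_eq!(nfc.normalize_utf8(b"\xFF"), "\u{FFFD}");
        /// ```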
1607 #[cfg(feature = "utf8_iter")]
1608 pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
1609 let (head, tail) = self.split_normalized_utf8(text);
1610 if tail.is_empty() {
1611 return Cow::Borrowed(head);
1612 }
1613 let mut ret = String::new();
1614 ret.reserve(text.len());
1615 ret.push_str(head);
1616 let _ = self.normalize_utf8_to(tail, &mut ret);
1617 Cow::Owned(ret)
1618 }
1619
1620 /// Split a slice of potentially-invalid UTF-8 into maximum normalized (and valid)
1621 /// prefix and unnormalized suffix such that the concatenation of the prefix and the
1622 /// normalization of the suffix is the normalization of the whole input.
1623 ///
1624 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1625 #[cfg(feature = "utf8_iter")]
1626 pub fn split_normalized_utf8<'a>(&self, text: &'a [u8]) -> (&'a str, &'a [u8]) {
1627 let up_to = self.is_normalized_utf8_up_to(text);
1628 let (head, tail) = text.split_at_checked(up_to).unwrap_or_else(|| {
1629 // Internal bug, not even GIGO, never supposed to happen
1630 debug_assert!(false);
1631 (&[], text)
1632 });
1633 // SAFETY: The normalization check also checks for
1634 // UTF-8 well-formedness.
1635 (unsafe { core::str::from_utf8_unchecked(head) }, tail)
1636 }
1637
        /// Return the index a slice of potentially-invalid UTF-8 is normalized up to.
1639 ///
1640 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1641 #[cfg(feature = "utf8_iter")]
1642 fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
1643 let mut sink = IsNormalizedSinkUtf8::new(text);
1644 let _ = self.normalize_utf8_to(text, &mut sink);
1645 text.len() - sink.remaining_len()
1646 }
1647
        /// Check whether a slice of potentially-invalid UTF-8 is normalized.
1649 ///
1650 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1651 /// according to the WHATWG Encoding Standard before checking.
1652 ///
1653 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1654 #[cfg(feature = "utf8_iter")]
1655 pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
1656 self.is_normalized_utf8_up_to(text) == text.len()
1657 }
1658 };
1659}
1660
1661/// Borrowed version of a normalizer for performing decomposing normalization.
1662#[derive(Debug)]
1663pub struct DecomposingNormalizerBorrowed<'a> {
1664 decompositions: &'a DecompositionData<'a>,
1665 tables: &'a DecompositionTables<'a>,
1666 supplementary_tables: Option<&'a DecompositionTables<'a>>,
1667 decomposition_passthrough_bound: u8, // never above 0xC0
1668 composition_passthrough_bound: u16, // never above 0x0300
1669}
1670
1671impl DecomposingNormalizerBorrowed<'static> {
1672 /// Cheaply converts a [`DecomposingNormalizerBorrowed<'static>`] into a [`DecomposingNormalizer`].
1673 ///
1674 /// Note: Due to branching and indirection, using [`DecomposingNormalizer`] might inhibit some
1675 /// compile-time optimizations that are possible with [`DecomposingNormalizerBorrowed`].
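    ///
    /// # Example
    ///
    /// A minimal sketch (assumes the `compiled_data` feature):
    ///
    /// ```
    /// use icu_normalizer::{DecomposingNormalizer, DecomposingNormalizerBorrowed};
    ///
    /// let owned: DecomposingNormalizer =
    ///     DecomposingNormalizerBorrowed::new_nfd().static_to_owned();
    /// assert_eq!(owned.as_borrowed().normalize("é"), "e\u{0301}");
    /// ```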
1676 pub const fn static_to_owned(self) -> DecomposingNormalizer {
1677 DecomposingNormalizer {
1678 decompositions: DataPayload::from_static_ref(self.decompositions),
1679 tables: DataPayload::from_static_ref(self.tables),
1680 supplementary_tables: if let Some(s) = self.supplementary_tables {
1681 // `map` not available in const context
1682 Some(DataPayload::from_static_ref(s))
1683 } else {
1684 None
1685 },
1686 decomposition_passthrough_bound: self.decomposition_passthrough_bound,
1687 composition_passthrough_bound: self.composition_passthrough_bound,
1688 }
1689 }
1690
1691 /// NFD constructor using compiled data.
1692 ///
1693 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1694 ///
1695 /// [📚 Help choosing a constructor](icu_provider::constructors)
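    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```
    /// let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
    /// assert_eq!(nfd.normalize("é"), "e\u{0301}");
    /// ```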
1696 #[cfg(feature = "compiled_data")]
1697 pub const fn new_nfd() -> Self {
1698 const _: () = assert!(
1699 crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1700 .scalars16
1701 .const_len()
1702 + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1703 .scalars24
1704 .const_len()
1705 <= 0xFFF,
1706 "future extension"
1707 );
1708
1709 DecomposingNormalizerBorrowed {
1710 decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
1711 tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1712 supplementary_tables: None,
1713 decomposition_passthrough_bound: 0xC0,
1714 composition_passthrough_bound: 0x0300,
1715 }
1716 }
1717
1718 /// NFKD constructor using compiled data.
1719 ///
1720 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1721 ///
1722 /// [📚 Help choosing a constructor](icu_provider::constructors)
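    ///
    /// # Example
    ///
    /// A minimal sketch (U+FB01 is the "fi" ligature, which has a
    /// compatibility decomposition):
    ///
    /// ```
    /// let nfkd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfkd();
    /// assert_eq!(nfkd.normalize("\u{FB01}"), "fi");
    /// ```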
1723 #[cfg(feature = "compiled_data")]
1724 pub const fn new_nfkd() -> Self {
1725 const _: () = assert!(
1726 crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1727 .scalars16
1728 .const_len()
1729 + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1730 .scalars24
1731 .const_len()
1732 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1733 .scalars16
1734 .const_len()
1735 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1736 .scalars24
1737 .const_len()
1738 <= 0xFFF,
1739 "future extension"
1740 );
1741
1742 const _: () = assert!(
1743 crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap <= 0x0300,
1744 "invalid"
1745 );
1746
1747 let decomposition_capped =
1748 if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0xC0 {
1749 crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1750 } else {
1751 0xC0
1752 };
1753 let composition_capped =
1754 if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0x0300 {
1755 crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1756 } else {
1757 0x0300
1758 };
1759
1760 DecomposingNormalizerBorrowed {
1761 decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1,
1762 tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1763 supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1764 decomposition_passthrough_bound: decomposition_capped as u8,
1765 composition_passthrough_bound: composition_capped,
1766 }
1767 }
1768
1769 #[cfg(feature = "compiled_data")]
1770 pub(crate) const fn new_uts46_decomposed() -> Self {
1771 const _: () = assert!(
1772 crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1773 .scalars16
1774 .const_len()
1775 + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1776 .scalars24
1777 .const_len()
1778 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1779 .scalars16
1780 .const_len()
1781 + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1782 .scalars24
1783 .const_len()
1784 <= 0xFFF,
1785 "future extension"
1786 );
1787
1788 const _: () = assert!(
1789 crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap <= 0x0300,
1790 "invalid"
1791 );
1792
1793 let decomposition_capped =
1794 if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0xC0 {
1795 crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1796 } else {
1797 0xC0
1798 };
1799 let composition_capped = if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1
1800 .passthrough_cap
1801 < 0x0300
1802 {
1803 crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1804 } else {
1805 0x0300
1806 };
1807
1808 DecomposingNormalizerBorrowed {
1809 decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1,
1810 tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1811 supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1812 decomposition_passthrough_bound: decomposition_capped as u8,
1813 composition_passthrough_bound: composition_capped,
1814 }
1815 }
1816}
1817
1818impl<'data> DecomposingNormalizerBorrowed<'data> {
1819 /// NFD constructor using already-loaded data.
1820 ///
1821 /// This constructor is intended for use by collations.
1822 ///
1823 /// [📚 Help choosing a constructor](icu_provider::constructors)
1824 #[doc(hidden)]
1825 pub fn new_with_data(
1826 decompositions: &'data DecompositionData<'data>,
1827 tables: &'data DecompositionTables<'data>,
1828 ) -> Self {
1829 Self {
1830 decompositions,
1831 tables,
1832 supplementary_tables: None,
1833 decomposition_passthrough_bound: 0xC0,
1834 composition_passthrough_bound: 0x0300,
1835 }
1836 }
1837
1838 /// Wraps a delegate iterator into a decomposing iterator
1839 /// adapter by using the data already held by this normalizer.
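    ///
    /// # Example
    ///
    /// A minimal sketch (assumes the `compiled_data` feature):
    ///
    /// ```
    /// let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
    /// let decomposed: String = nfd.normalize_iter("é".chars()).collect();
    /// assert_eq!(decomposed, "e\u{0301}");
    /// ```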
1840 pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<'data, I> {
1841 Decomposition::new_with_supplements(
1842 iter,
1843 self.decompositions,
1844 self.tables,
1845 self.supplementary_tables,
1846 self.decomposition_passthrough_bound,
1847 IgnorableBehavior::Unsupported,
1848 )
1849 }
1850
1851 normalizer_methods!();
1852
1853 decomposing_normalize_to!(
1854 /// Normalize a string slice into a `Write` sink.
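        ///
        /// # Example
        ///
        /// A minimal sketch (assumes the `compiled_data` feature; `String`
        /// implements `core::fmt::Write`):
        ///
        /// ```
        /// let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
        /// let mut out = String::new();
        /// let _ = nfd.normalize_to("é", &mut out);
        /// assert_eq!(out, "e\u{0301}");
        /// ```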
1855 ,
1856 normalize_to,
1857 core::fmt::Write,
1858 &str,
1859 {
1860 },
1861 as_str,
1862 {
1863 let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
1864 0xC3u8
1865 } else {
1866 decomposition_passthrough_bound.min(0x80) as u8
1867 };
1868 // The attribute belongs on an inner statement, but Rust doesn't allow it there.
1869 #[expect(clippy::unwrap_used)]
1870 'fast: loop {
1871 let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
1872 'fastest: loop {
1873 if let Some(&upcoming_byte) = code_unit_iter.next() {
1874 if upcoming_byte < decomposition_passthrough_byte_bound {
1875 // Fast-track succeeded!
1876 continue 'fastest;
1877 }
1878 // This deliberately isn't panic-free, since the code pattern
1879 // that was OK for the composing counterpart regressed
1880 // English and French performance if done here, too.
1881 decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
1882 break 'fastest;
1883 }
1884 // End of stream
1885 sink.write_str(pending_slice)?;
1886 return Ok(());
1887 }
1888
1889 // `unwrap()` OK, because the slice is valid UTF-8 and we know there
1890 // is an upcoming byte.
1891 let upcoming = decomposition.delegate.next().unwrap();
1892 let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
1893 if upcoming_with_trie_value.starter_and_decomposes_to_self() {
1894 continue 'fast;
1895 }
1896 let consumed_so_far_slice = &pending_slice[..pending_slice.len()
1897 - decomposition.delegate.as_str().len()
1898 - upcoming.len_utf8()];
1899 sink.write_str(consumed_so_far_slice)?;
1900
1901 // Now let's figure out if we got a starter or a non-starter.
1902 if decomposition_starts_with_non_starter(
1903 upcoming_with_trie_value.trie_val,
1904 ) {
                    // Let this trie value be reprocessed in case it is
                    // one of the rare decomposing ones.
1907 decomposition.pending = Some(upcoming_with_trie_value);
1908 decomposition.gather_and_sort_combining(0);
1909 continue 'outer;
1910 }
1911 undecomposed_starter = upcoming_with_trie_value;
1912 debug_assert!(decomposition.pending.is_none());
1913 break 'fast;
1914 }
1915 },
1916 text,
1917 sink,
1918 decomposition,
1919 decomposition_passthrough_bound,
1920 undecomposed_starter,
1921 pending_slice,
1922 'outer,
1923 );
1924
1925 decomposing_normalize_to!(
1926 /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
1927 ///
1928 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1929 /// according to the WHATWG Encoding Standard.
1930 ///
1931 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1932 #[cfg(feature = "utf8_iter")]
1933 ,
1934 normalize_utf8_to,
1935 core::fmt::Write,
1936 &[u8],
1937 {
1938 },
1939 as_slice,
1940 {
1941 let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8;
1942 'fast: loop {
1943 let mut code_unit_iter = decomposition.delegate.as_slice().iter();
1944 'fastest: loop {
1945 if let Some(&upcoming_byte) = code_unit_iter.next() {
1946 if upcoming_byte < decomposition_passthrough_byte_bound {
1947 // Fast-track succeeded!
1948 continue 'fastest;
1949 }
1950 break 'fastest;
1951 }
1952 // End of stream
1953 sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
1954 return Ok(());
1955 }
1956 #[expect(clippy::indexing_slicing)]
1957 {decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();}
1958
1959 // `unwrap()` OK, because the slice is valid UTF-8 and we know there
1960 // is an upcoming byte.
1961 #[expect(clippy::unwrap_used)]
1962 let upcoming = decomposition.delegate.next().unwrap();
1963 let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
1964 if upcoming_with_trie_value.starter_and_decomposes_to_self_except_replacement() {
1965 // Note: The trie value of the REPLACEMENT CHARACTER is
1966 // intentionally formatted to fail the
1967 // `starter_and_decomposes_to_self` test even though it
1968 // really is a starter that decomposes to self. This
                    // allows moving the branch on REPLACEMENT CHARACTER
1970 // below this `continue`.
1971 continue 'fast;
1972 }
1973
1974 // TODO: Annotate as unlikely.
1975 if upcoming == REPLACEMENT_CHARACTER {
1976 // We might have an error, so fall out of the fast path.
1977
1978 // Since the U+FFFD might signify an error, we can't
1979 // assume `upcoming.len_utf8()` for the backoff length.
1980 #[expect(clippy::indexing_slicing)]
1981 let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
1982 let back = consumed_so_far.next_back();
1983 debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
1984 let consumed_so_far_slice = consumed_so_far.as_slice();
1985 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;
1986
1987 // We could call `gather_and_sort_combining` here and
1988 // `continue 'outer`, but this should be better for code
1989 // size.
1990 undecomposed_starter = upcoming_with_trie_value;
1991 debug_assert!(decomposition.pending.is_none());
1992 break 'fast;
1993 }
1994
1995 #[expect(clippy::indexing_slicing)]
1996 let consumed_so_far_slice = &pending_slice[..pending_slice.len()
1997 - decomposition.delegate.as_slice().len()
1998 - upcoming.len_utf8()];
1999 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;
2000
2001 // Now let's figure out if we got a starter or a non-starter.
2002 if decomposition_starts_with_non_starter(
2003 upcoming_with_trie_value.trie_val,
2004 ) {
                    // Let this trie value be reprocessed in case it is
                    // one of the rare decomposing ones.
2007 decomposition.pending = Some(upcoming_with_trie_value);
2008 decomposition.gather_and_sort_combining(0);
2009 continue 'outer;
2010 }
2011 undecomposed_starter = upcoming_with_trie_value;
2012 debug_assert!(decomposition.pending.is_none());
2013 break 'fast;
2014 }
2015 },
2016 text,
2017 sink,
2018 decomposition,
2019 decomposition_passthrough_bound,
2020 undecomposed_starter,
2021 pending_slice,
2022 'outer,
2023 );
2024
2025 decomposing_normalize_to!(
2026 /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
2027 ///
2028 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
2029 /// before normalizing.
2030 ///
2031 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
2032 #[cfg(feature = "utf16_iter")]
2033 ,
2034 normalize_utf16_to,
2035 write16::Write16,
2036 &[u16],
2037 {
2038 sink.size_hint(text.len())?;
2039 },
2040 as_slice,
2041 {
2042 // This loop is only broken out of as goto forward and only as release-build recovery from
2043 // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
2044 #[expect(clippy::never_loop)]
2045 'fastwrap: loop {
2046 // Commented out `code_unit_iter` and used `ptr` and `end` to
2047 // work around https://github.com/rust-lang/rust/issues/144684 .
2048 //
2049 // let mut code_unit_iter = decomposition.delegate.as_slice().iter();
2050 let delegate_as_slice = decomposition.delegate.as_slice();
2051 let mut ptr: *const u16 = delegate_as_slice.as_ptr();
2052 // SAFETY: materializing a pointer immediately past the end of an
2053 // allocation is OK.
2054 let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
2055 'fast: loop {
2056 // if let Some(&upcoming_code_unit) = code_unit_iter.next() {
2057 if ptr != end {
2058 // SAFETY: We just checked that `ptr` has not reached `end`.
2059 // `ptr` always advances by one, and we always have a check
2060 // per advancement.
2061 let upcoming_code_unit = unsafe { *ptr };
2062 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2063 // by one points to the same allocation or to immediately
2064 // after, which is OK.
2065 ptr = unsafe { ptr.add(1) };
2066
2067 let mut upcoming32 = u32::from(upcoming_code_unit);
                        // The performance of what is logically supposed to be this
                        // branch is _incredibly_ brittle: what LLVM ends up doing
                        // around this decision can double or halve the throughput
                        // for Basic Latin in ways that are completely unintuitive.
                        // Basically _any_ change to _any_ code that participates
                        // in how LLVM sees the code around here can make the perf
                        // fall over. It seems that manually annotating this branch
                        // as likely has worse effects on non-Basic-Latin input than
                        // the case where LLVM just happens to do the right thing.
2078 //
2079 // What happens with this branch may depend on what sink type
2080 // this code is monomorphized over.
2081 //
2082 // What a terrible sink of developer time!
2083 if upcoming32 < decomposition_passthrough_bound {
2084 continue 'fast;
2085 }
2086 // We might be doing a trie lookup by surrogate. Surrogates get
2087 // a decomposition to U+FFFD.
2088 let mut trie_value = decomposition.trie.get16(upcoming_code_unit);
2089 if starter_and_decomposes_to_self_impl(trie_value) {
2090 continue 'fast;
2091 }
2092 // We might now be looking at a surrogate.
2093 // The loop is only broken out of as goto forward
2094 #[expect(clippy::never_loop)]
2095 'surrogateloop: loop {
2096 // LLVM's optimizations are incredibly brittle for the code _above_,
2097 // and using `likely` _below_ without using it _above_ helps!
2098 // What a massive sink of developer time!
2099 // Seriously, the effect of these annotations is massively
2100 // unintuitive. Measure everything!
2101 // Notably, the `if likely(...)` formulation optimizes differently
2102 // than just putting `cold_path()` on the `else` path!
2103 let surrogate_base = upcoming32.wrapping_sub(0xD800);
2104 if likely(surrogate_base > (0xDFFF - 0xD800)) {
2105 // Not surrogate
2106 break 'surrogateloop;
2107 }
2108 if likely(surrogate_base <= (0xDBFF - 0xD800)) {
2109 // let iter_backup = code_unit_iter.clone();
2110 // if let Some(&low) = code_unit_iter.next() {
2111 if ptr != end {
2112 // SAFETY: We just checked that `ptr` has not reached `end`.
2113 // `ptr` always advances by one, and we always have a check
2114 // per advancement.
2115 let low = unsafe { *ptr };
2116 if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
2117 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2118 // by one points to the same allocation or to immediately
2119 // after, which is OK.
2120 ptr = unsafe { ptr.add(1) };
2121
2122 upcoming32 = (upcoming32 << 10) + u32::from(low)
2123 - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
2124 // Successfully-paired surrogate. Read from the trie again.
2125 trie_value = {
2126 // Semantically, this bit of conditional compilation makes no sense.
2127 // The purpose is to keep LLVM seeing the untyped trie case the way
2128 // it did before so as not to regress the performance of the untyped
2129 // case due to unintuitive optimizer effects. If you care about the
2130 // perf of the untyped trie case and have better ideas, please try
2131 // something better.
2132 #[cfg(not(icu4x_unstable_fast_trie_only))]
2133 {decomposition.trie.get32(upcoming32)}
2134 #[cfg(icu4x_unstable_fast_trie_only)]
2135 {decomposition.trie.get32_supplementary(upcoming32)}
2136 };
2137 if likely(starter_and_decomposes_to_self_impl(trie_value)) {
2138 continue 'fast;
2139 }
2140 break 'surrogateloop;
2141 // } else {
2142 // code_unit_iter = iter_backup;
2143 }
2144 }
2145 }
2146 // unpaired surrogate
2147 upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
2148 // trie_value already holds a decomposition to U+FFFD.
2149 break 'surrogateloop;
2150 }
2151
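                        // SAFETY: `upcoming32` can no longer be a surrogate:
                        // unpaired surrogates were replaced with U+FFFD above.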
2152 let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
                        let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);

2156 let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
2157 // code_unit_iter.as_slice().len()
2158 // SAFETY: `ptr` and `end` have been derived from the same allocation
2159 // and `ptr` is never greater than `end`.
2160 unsafe { end.offset_from(ptr) as usize }
2161 - upcoming.len_utf16()) else {
2162 // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2163 debug_assert!(false);
2164 // Throw away the results of the fast path.
2165 break 'fastwrap;
2166 };
2167 sink.write_slice(consumed_so_far_slice)?;
2168
2169 if decomposition_starts_with_non_starter(
2170 upcoming_with_trie_value.trie_val,
2171 ) {
2172 // Sync with main iterator
2173 // decomposition.delegate = code_unit_iter.as_slice().chars();
2174 // SAFETY: `ptr` and `end` have been derived from the same allocation
2175 // and `ptr` is never greater than `end`.
2176 decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
                            // Let this trie value be reprocessed in case it is
                            // one of the rare decomposing ones.
2179 decomposition.pending = Some(upcoming_with_trie_value);
2180 decomposition.gather_and_sort_combining(0);
2181 continue 'outer;
2182 }
2183 undecomposed_starter = upcoming_with_trie_value;
2184 debug_assert!(decomposition.pending.is_none());
2185 break 'fast;
2186 }
2187 // End of stream
2188 sink.write_slice(pending_slice)?;
2189 return Ok(());
2190 }
2191 // Sync the main iterator
2192 // decomposition.delegate = code_unit_iter.as_slice().chars();
2193 // SAFETY: `ptr` and `end` have been derived from the same allocation
2194 // and `ptr` is never greater than `end`.
2195 decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
2196 break 'fastwrap;
2197 }
2198 },
2199 text,
2200 sink,
2201 decomposition,
2202 decomposition_passthrough_bound,
2203 undecomposed_starter,
2204 pending_slice,
2205 'outer,
2206 );
2207}
2208
2209/// A normalizer for performing decomposing normalization.
2210#[derive(Debug)]
2211pub struct DecomposingNormalizer {
2212 decompositions: DataPayload<NormalizerNfdDataV1>,
2213 tables: DataPayload<NormalizerNfdTablesV1>,
2214 supplementary_tables: Option<DataPayload<NormalizerNfkdTablesV1>>,
2215 decomposition_passthrough_bound: u8, // never above 0xC0
2216 composition_passthrough_bound: u16, // never above 0x0300
2217}
2218
2219impl DecomposingNormalizer {
2220 /// Constructs a borrowed version of this type for more efficient querying.
2221 pub fn as_borrowed(&self) -> DecomposingNormalizerBorrowed<'_> {
2222 DecomposingNormalizerBorrowed {
2223 decompositions: self.decompositions.get(),
2224 tables: self.tables.get(),
2225 supplementary_tables: self.supplementary_tables.as_ref().map(|s| s.get()),
2226 decomposition_passthrough_bound: self.decomposition_passthrough_bound,
2227 composition_passthrough_bound: self.composition_passthrough_bound,
2228 }
2229 }
2230
2231 /// NFD constructor using compiled data.
2232 ///
2233 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2234 ///
2235 /// [📚 Help choosing a constructor](icu_provider::constructors)
2236 #[cfg(feature = "compiled_data")]
2237 pub const fn new_nfd() -> DecomposingNormalizerBorrowed<'static> {
2238 DecomposingNormalizerBorrowed::new_nfd()
2239 }
2240
2241 icu_provider::gen_buffer_data_constructors!(
2242 () -> error: DataError,
2243 functions: [
2244 new_nfd: skip,
2245 try_new_nfd_with_buffer_provider,
2246 try_new_nfd_unstable,
2247 Self,
2248 ]
2249 );
2250
2251 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
2252 pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, DataError>
2253 where
2254 D: DataProvider<NormalizerNfdDataV1> + DataProvider<NormalizerNfdTablesV1> + ?Sized,
2255 {
2256 let decompositions: DataPayload<NormalizerNfdDataV1> =
2257 provider.load(Default::default())?.payload;
2258 let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2259
2260 if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
2261 // The data is from a future where there exists a normalization flavor whose
2262 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2263 // of space. If a good use case from such a decomposition flavor arises, we can
2264 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2265 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2266 // since for now the masks are hard-coded, error out.
2267 return Err(
2268 DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2269 );
2270 }
2271
2272 let cap = decompositions.get().passthrough_cap;
2273 if cap > 0x0300 {
2274 return Err(DataError::custom("invalid").with_marker(NormalizerNfdDataV1::INFO));
2275 }
2276 let decomposition_capped = cap.min(0xC0);
2277 let composition_capped = cap.min(0x0300);
2278
2279 Ok(DecomposingNormalizer {
2280 decompositions,
2281 tables,
2282 supplementary_tables: None,
2283 decomposition_passthrough_bound: decomposition_capped as u8,
2284 composition_passthrough_bound: composition_capped,
2285 })
2286 }
2287
2288 icu_provider::gen_buffer_data_constructors!(
2289 () -> error: DataError,
2290 functions: [
2291 new_nfkd: skip,
2292 try_new_nfkd_with_buffer_provider,
2293 try_new_nfkd_unstable,
2294 Self,
2295 ]
2296 );
2297
2298 /// NFKD constructor using compiled data.
2299 ///
2300 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2301 ///
2302 /// [📚 Help choosing a constructor](icu_provider::constructors)
2303 #[cfg(feature = "compiled_data")]
2304 pub const fn new_nfkd() -> DecomposingNormalizerBorrowed<'static> {
2305 DecomposingNormalizerBorrowed::new_nfkd()
2306 }
2307
2308 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
2309 pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, DataError>
2310 where
2311 D: DataProvider<NormalizerNfkdDataV1>
2312 + DataProvider<NormalizerNfdTablesV1>
2313 + DataProvider<NormalizerNfkdTablesV1>
2314 + ?Sized,
2315 {
2316 let decompositions: DataPayload<NormalizerNfkdDataV1> =
2317 provider.load(Default::default())?.payload;
2318 let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2319 let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2320 provider.load(Default::default())?.payload;
2321
2322 if tables.get().scalars16.len()
2323 + tables.get().scalars24.len()
2324 + supplementary_tables.get().scalars16.len()
2325 + supplementary_tables.get().scalars24.len()
2326 > 0xFFF
2327 {
2328 // The data is from a future where there exists a normalization flavor whose
2329 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2330 // of space. If a good use case from such a decomposition flavor arises, we can
2331 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2332 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2333 // since for now the masks are hard-coded, error out.
2334 return Err(
2335 DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2336 );
2337 }
2338
2339 let cap = decompositions.get().passthrough_cap;
2340 if cap > 0x0300 {
2341 return Err(DataError::custom("invalid").with_marker(NormalizerNfkdDataV1::INFO));
2342 }
2343 let decomposition_capped = cap.min(0xC0);
2344 let composition_capped = cap.min(0x0300);
2345
2346 Ok(DecomposingNormalizer {
2347 decompositions: decompositions.cast(),
2348 tables,
2349 supplementary_tables: Some(supplementary_tables),
2350 decomposition_passthrough_bound: decomposition_capped as u8,
2351 composition_passthrough_bound: composition_capped,
2352 })
2353 }
2354
2355 /// UTS 46 decomposed constructor (testing only)
2356 ///
    /// This is a special building block normalization for IDNA. It is the decomposed counterpart of
    /// ICU4C's UTS 46 normalization, with two exceptions: characters that UTS 46 disallows and that
    /// ICU4C maps to U+FFFD, and characters that UTS 46 maps to the empty string, both normalize as
    /// in NFD in this normalization. In both cases, the UTS 46 processing that takes place before
    /// normalization is expected to deal with these characters. Making the disallowed characters
    /// behave like this is beneficial to data size, and this normalizer implementation cannot
    /// deal with a character normalizing to the empty string, which doesn't happen in NFD or
    /// NFKD as of Unicode 14.
2365 ///
2366 /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2367 /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
    /// U+0345 from a reorderable character into a non-reorderable character before reordering happens.
2369 /// Therefore, the output of this normalization may differ for different inputs that are
2370 /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
2371 /// to other reorderable characters.
2372 pub(crate) fn try_new_uts46_decomposed_unstable<D>(provider: &D) -> Result<Self, DataError>
2373 where
2374 D: DataProvider<NormalizerUts46DataV1>
2375 + DataProvider<NormalizerNfdTablesV1>
2376 + DataProvider<NormalizerNfkdTablesV1>
2377 // UTS 46 tables merged into CompatibilityDecompositionTablesV1
2378 + ?Sized,
2379 {
2380 let decompositions: DataPayload<NormalizerUts46DataV1> =
2381 provider.load(Default::default())?.payload;
2382 let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2383 let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2384 provider.load(Default::default())?.payload;
2385
2386 if tables.get().scalars16.len()
2387 + tables.get().scalars24.len()
2388 + supplementary_tables.get().scalars16.len()
2389 + supplementary_tables.get().scalars24.len()
2390 > 0xFFF
2391 {
2392 // The data is from a future where there exists a normalization flavor whose
2393 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2394 // of space. If a good use case from such a decomposition flavor arises, we can
2395 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2396 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2397 // since for now the masks are hard-coded, error out.
2398 return Err(
2399 DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2400 );
2401 }
2402
2403 let cap = decompositions.get().passthrough_cap;
2404 if cap > 0x0300 {
2405 return Err(DataError::custom("invalid").with_marker(NormalizerUts46DataV1::INFO));
2406 }
2407 let decomposition_capped = cap.min(0xC0);
2408 let composition_capped = cap.min(0x0300);
2409
2410 Ok(DecomposingNormalizer {
2411 decompositions: decompositions.cast(),
2412 tables,
2413 supplementary_tables: Some(supplementary_tables),
2414 decomposition_passthrough_bound: decomposition_capped as u8,
2415 composition_passthrough_bound: composition_capped,
2416 })
2417 }
2418}
2419
2420/// Borrowed version of a normalizer for performing composing normalization.
2421#[derive(Debug)]
2422pub struct ComposingNormalizerBorrowed<'a> {
2423 decomposing_normalizer: DecomposingNormalizerBorrowed<'a>,
2424 canonical_compositions: &'a CanonicalCompositions<'a>,
2425}
2426
2427impl ComposingNormalizerBorrowed<'static> {
2428 /// Cheaply converts a [`ComposingNormalizerBorrowed<'static>`] into a [`ComposingNormalizer`].
2429 ///
2430 /// Note: Due to branching and indirection, using [`ComposingNormalizer`] might inhibit some
2431 /// compile-time optimizations that are possible with [`ComposingNormalizerBorrowed`].
2432 pub const fn static_to_owned(self) -> ComposingNormalizer {
2433 ComposingNormalizer {
2434 decomposing_normalizer: self.decomposing_normalizer.static_to_owned(),
2435 canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions),
2436 }
2437 }
2438
2439 /// NFC constructor using compiled data.
2440 ///
2441 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2442 ///
2443 /// [📚 Help choosing a constructor](icu_provider::constructors)
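    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```
    /// let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
    /// assert_eq!(nfc.normalize("e\u{0301}"), "é");
    /// ```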
2444 #[cfg(feature = "compiled_data")]
2445 pub const fn new_nfc() -> Self {
2446 ComposingNormalizerBorrowed {
2447 decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfd(),
2448 canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2449 }
2450 }
2451
2452 /// NFKC constructor using compiled data.
2453 ///
2454 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2455 ///
2456 /// [📚 Help choosing a constructor](icu_provider::constructors)
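    ///
    /// # Example
    ///
    /// A minimal sketch (U+FB01 is the "fi" ligature, which has a
    /// compatibility decomposition):
    ///
    /// ```
    /// let nfkc = icu_normalizer::ComposingNormalizerBorrowed::new_nfkc();
    /// assert_eq!(nfkc.normalize("\u{FB01}"), "fi");
    /// ```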
2457 #[cfg(feature = "compiled_data")]
2458 pub const fn new_nfkc() -> Self {
2459 ComposingNormalizerBorrowed {
2460 decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfkd(),
2461 canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2462 }
2463 }
2464
2465 /// This is a special building block normalization for IDNA that implements parts of the Map
2466 /// step and the following Normalize step.
2467 ///
2468 /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2469 /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
    /// U+0345 from a reorderable character into a non-reorderable character before reordering happens.
2471 /// Therefore, the output of this normalization may differ for different inputs that are
    /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
2473 /// to other reorderable characters.
2474 #[cfg(feature = "compiled_data")]
2475 pub(crate) const fn new_uts46() -> Self {
2476 ComposingNormalizerBorrowed {
2477 decomposing_normalizer: DecomposingNormalizerBorrowed::new_uts46_decomposed(),
2478 canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2479 }
2480 }
2481}
2482
2483impl<'data> ComposingNormalizerBorrowed<'data> {
2484 /// Wraps a delegate iterator into a composing iterator
2485 /// adapter by using the data already held by this normalizer.
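    ///
    /// # Example
    ///
    /// A minimal sketch (assumes the `compiled_data` feature):
    ///
    /// ```
    /// let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
    /// let composed: String = nfc.normalize_iter("e\u{0301}".chars()).collect();
    /// assert_eq!(composed, "é");
    /// ```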
2486 pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<'data, I> {
2487 self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
2488 }
2489
2490 fn normalize_iter_private<I: Iterator<Item = char>>(
2491 &self,
2492 iter: I,
2493 ignorable_behavior: IgnorableBehavior,
2494 ) -> Composition<'data, I> {
2495 Composition::new(
2496 Decomposition::new_with_supplements(
2497 iter,
2498 self.decomposing_normalizer.decompositions,
2499 self.decomposing_normalizer.tables,
2500 self.decomposing_normalizer.supplementary_tables,
2501 self.decomposing_normalizer.decomposition_passthrough_bound,
2502 ignorable_behavior,
2503 ),
2504 self.canonical_compositions.canonical_compositions.clone(),
2505 self.decomposing_normalizer.composition_passthrough_bound,
2506 )
2507 }
2508
2509 normalizer_methods!();
2510
2511 composing_normalize_to!(
2512 /// Normalize a string slice into a `Write` sink.
2513 ,
2514 normalize_to,
2515 core::fmt::Write,
2516 &str,
2517 {},
2518 true,
2519 as_str,
2520 {
2521 // Let's hope LICM hoists this outside `'outer`.
2522 let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
2523 0xCCu8
2524 } else {
            // We can make this fancy if a normalization other than NFC, where looking
            // at non-ASCII lead bytes is worthwhile, is ever introduced.
2527 composition_passthrough_bound.min(0x80) as u8
2528 };
2529 // Attributes have to be on blocks, so hoisting all the way here.
2530 #[expect(clippy::unwrap_used)]
2531 'fast: loop {
2532 let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
2533 'fastest: loop {
2534 if let Some(&upcoming_byte) = code_unit_iter.next() {
2535 if upcoming_byte < composition_passthrough_byte_bound {
2536 // Fast-track succeeded!
2537 continue 'fastest;
2538 }
2539 let Some(remaining_slice) = pending_slice.get(pending_slice.len() - code_unit_iter.as_slice().len() - 1..) else {
2540 // If we ever come here, it's an internal bug. Let's avoid panic code paths in release builds.
2541 debug_assert!(false);
2542 // Throw away the fastest-path result in case of an internal bug.
2543 break 'fastest;
2544 };
2545 composition.decomposition.delegate = remaining_slice.chars();
2546 break 'fastest;
2547 }
2548 // End of stream
2549 sink.write_str(pending_slice)?;
2550 return Ok(());
2551 }
2552 // `unwrap()` OK, because the slice is valid UTF-8 and we know there
2553 // is an upcoming byte.
2554 let upcoming = composition.decomposition.delegate.next().unwrap();
2555 let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2556 if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2557 // Can't combine backwards, hence a plain (non-backwards-combining)
2558 // starter albeit past `composition_passthrough_bound`
2559
2560 // Fast-track succeeded!
2561 continue 'fast;
2562 }
2563 // We need to fall off the fast path.
2564 composition.decomposition.pending = Some(upcoming_with_trie_value);
2565
2566 // slicing and unwrap OK, because we've just evidently read enough previously.
2567 let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
                // `unwrap` OK, because we've previously managed to read the previous character.
2569 undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2570 let consumed_so_far_slice = consumed_so_far.as_str();
2571 sink.write_str(consumed_so_far_slice)?;
2572 break 'fast;
2573 }
2574 },
2575 text,
2576 sink,
2577 composition,
2578 composition_passthrough_bound,
2579 undecomposed_starter,
2580 pending_slice,
2581 len_utf8,
2582 );
2583
2584 composing_normalize_to!(
2585 /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
2586 ///
2587 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
2588 /// according to the WHATWG Encoding Standard.
2589 ///
2590 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
2591 #[cfg(feature = "utf8_iter")]
2592 ,
2593 normalize_utf8_to,
2594 core::fmt::Write,
2595 &[u8],
2596 {},
2597 false,
2598 as_slice,
2599 {
2600 'fast: loop {
2601 if let Some(upcoming) = composition.decomposition.delegate.next() {
2602 if u32::from(upcoming) < composition_passthrough_bound {
2603 // Fast-track succeeded!
2604 continue 'fast;
2605 }
2606 // TODO: Be statically aware of fast/small trie.
2607 let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2608 if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2609 // Note: The trie value of the REPLACEMENT CHARACTER is
2610 // intentionally formatted to fail the
2611 // `potential_passthrough_and_cannot_combine_backwards`
2612 // test even though it really is a starter that decomposes
2613 // to self and cannot combine backwards. This
2614 // Allows moving the branch on REPLACEMENT CHARACTER
2615 // below this `continue`.
2616 continue 'fast;
2617 }
2618 // We need to fall off the fast path.
2619
2620 // TODO(#2006): Annotate as unlikely
2621 if upcoming == REPLACEMENT_CHARACTER {
2622 // Can't tell if this is an error or a literal U+FFFD in
2623 // the input. Assuming the former to be sure.
2624
2625 // Since the U+FFFD might signify an error, we can't
2626 // assume `upcoming.len_utf8()` for the backoff length.
2627 #[expect(clippy::indexing_slicing)]
2628 let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
2629 let back = consumed_so_far.next_back();
2630 debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
2631 let consumed_so_far_slice = consumed_so_far.as_slice();
2632 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) })?;
2633 undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
2634 composition.decomposition.pending = None;
2635 break 'fast;
2636 }
2637
2638 composition.decomposition.pending = Some(upcoming_with_trie_value);
2639 // slicing and unwrap OK, because we've just evidently read enough previously.
                    // `unwrap` OK, because we've previously managed to read the previous character.
2641 #[expect(clippy::indexing_slicing)]
2642 let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
2643 #[expect(clippy::unwrap_used)]
2644 {
2645 // TODO: If the previous character was below the passthrough bound,
2646 // we really need to read from the trie. Otherwise, we could maintain
2647 // the most-recent trie value. Need to measure what's more expensive:
2648 // Remembering the trie value on each iteration or re-reading the
2649 // last one after the fast-track run.
2650 undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2651 }
2652 let consumed_so_far_slice = consumed_so_far.as_slice();
2653 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice)})?;
2654 break 'fast;
2655 }
2656 // End of stream
2657 sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
2658 return Ok(());
2659 }
2660 },
2661 text,
2662 sink,
2663 composition,
2664 composition_passthrough_bound,
2665 undecomposed_starter,
2666 pending_slice,
2667 len_utf8,
2668 );
2669
2670 composing_normalize_to!(
2671 /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
2672 ///
2673 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
2674 /// before normalizing.
2675 ///
2676 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
2677 #[cfg(feature = "utf16_iter")]
2678 ,
2679 normalize_utf16_to,
2680 write16::Write16,
2681 &[u16],
2682 {
2683 sink.size_hint(text.len())?;
2684 },
2685 false,
2686 as_slice,
2687 {
2688 // This loop is only broken out of as goto forward and only as release-build recovery from
2689 // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
2690 #[expect(clippy::never_loop)]
2691 'fastwrap: loop {
2692 // Commented out `code_unit_iter` and used `ptr` and `end` to
2693 // work around https://github.com/rust-lang/rust/issues/144684 .
2694 //
2695 // let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
2696 let delegate_as_slice = composition.decomposition.delegate.as_slice();
2697 let mut ptr: *const u16 = delegate_as_slice.as_ptr();
2698 // SAFETY: materializing a pointer immediately past the end of an
2699 // allocation is OK.
2700 let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
2701
2702 'fast: loop {
2703 // if let Some(&upcoming_code_unit) = code_unit_iter.next() {
2704 if ptr != end {
2705 // SAFETY: We just checked that `ptr` has not reached `end`.
2706 // `ptr` always advances by one, and we always have a check
2707 // per advancement.
2708 let upcoming_code_unit = unsafe { *ptr };
2709 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2710 // by one points to the same allocation or to immediately
2711 // after, which is OK.
2712 ptr = unsafe { ptr.add(1) };
2713
2714 let mut upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
                        // The performance of what is logically supposed to be this
                        // branch is somewhat brittle: what LLVM ends up doing
                        // around this decision can double or halve the throughput
                        // for Basic Latin in ways that are completely unintuitive.
                        // Basically _any_ change to _any_ code that participates
                        // in how LLVM sees the code around here can make the perf
                        // fall over. It seems that manually annotating this branch
                        // as likely has worse effects on non-Basic-Latin input than
                        // the case where LLVM just happens to do the right thing.
2725 //
2726 // What happens with this branch may depend on what sink type
2727 // this code is monomorphized over.
2728 //
2729 // What a terrible sink of developer time!
2730 if upcoming32 < composition_passthrough_bound {
2731 // No need for surrogate or U+FFFD check, because
2732 // `composition_passthrough_bound` cannot be higher than
2733 // U+0300.
2734 // Fast-track succeeded!
2735 continue 'fast;
2736 }
2737 // We might be doing a trie lookup by surrogate. Surrogates get
2738 // a decomposition to U+FFFD.
2739 let mut trie_value = composition.decomposition.trie.get16(upcoming_code_unit);
2740 if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) {
2741 // Can't combine backwards, hence a plain (non-backwards-combining)
2742 // starter albeit past `composition_passthrough_bound`
2743
2744 // Fast-track succeeded!
2745 continue 'fast;
2746 }
2747
2748 // We might now be looking at a surrogate.
2749 // The loop is only broken out of as goto forward
2750 #[expect(clippy::never_loop)]
2751 'surrogateloop: loop {
2752 // The `likely` annotations _below_ exist to make the code _above_
2753 // go faster!
2754 let surrogate_base = upcoming32.wrapping_sub(0xD800);
2755 if likely(surrogate_base > (0xDFFF - 0xD800)) {
2756 // Not surrogate
2757 break 'surrogateloop;
2758 }
2759 if likely(surrogate_base <= (0xDBFF - 0xD800)) {
2760 // let iter_backup = code_unit_iter.clone();
2761 // if let Some(&low) = code_unit_iter.next() {
2762 if ptr != end {
2763 // SAFETY: We just checked that `ptr` has not reached `end`.
2764 // `ptr` always advances by one, and we always have a check
2765 // per advancement.
2766 let low = unsafe { *ptr };
2767 if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
2768 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2769 // by one points to the same allocation or to immediately
2770 // after, which is OK.
2771 ptr = unsafe { ptr.add(1) };
2772
2773 upcoming32 = (upcoming32 << 10) + u32::from(low)
2774 - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
2775 // Successfully-paired surrogate. Read from the trie again.
2776 trie_value = {
2777 // Semantically, this bit of conditional compilation makes no sense.
2778 // The purpose is to keep LLVM seeing the untyped trie case the way
2779 // it did before so as not to regress the performance of the untyped
2780 // case due to unintuitive optimizer effects. If you care about the
2781 // perf of the untyped trie case and have better ideas, please try
2782 // something better.
2783 #[cfg(not(icu4x_unstable_fast_trie_only))]
2784 {composition.decomposition.trie.get32(upcoming32)}
2785 #[cfg(icu4x_unstable_fast_trie_only)]
2786 {composition.decomposition.trie.get32_supplementary(upcoming32)}
2787 };
2788 if likely(potential_passthrough_and_cannot_combine_backwards_impl(trie_value)) {
2789 // Fast-track succeeded!
2790 continue 'fast;
2791 }
2792 break 'surrogateloop;
2793 // } else {
2794 // code_unit_iter = iter_backup;
2795 }
2796 }
2797 }
2798 // unpaired surrogate
2799 upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
2800 // trie_value already holds a decomposition to U+FFFD.
2801 debug_assert_eq!(trie_value, NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER | 0xFFFD);
2802 break 'surrogateloop;
2803 }
2804
2805 // SAFETY: upcoming32 can no longer be a surrogate.
2806 let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
2807 let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
2808 // We need to fall off the fast path.
2809 composition.decomposition.pending = Some(upcoming_with_trie_value);
2810 let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
2811 // code_unit_iter.as_slice().len()
2812 // SAFETY: `ptr` and `end` have been derived from the same allocation
2813 // and `ptr` is never greater than `end`.
2814 unsafe { end.offset_from(ptr) as usize }
2815 - upcoming.len_utf16()) else {
2816 // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2817 debug_assert!(false);
2818 // Throw away the results of the fast path.
2819 break 'fastwrap;
2820 };
2821 let mut consumed_so_far = consumed_so_far_slice.chars();
2822 let Some(c_from_back) = consumed_so_far.next_back() else {
2823 // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2824 debug_assert!(false);
2825 // Throw away the results of the fast path.
2826 break 'fastwrap;
2827 };
2828 // TODO: If the previous character was below the passthrough bound,
2829 // we really need to read from the trie. Otherwise, we could maintain
2830 // the most-recent trie value. Need to measure what's more expensive:
2831 // Remembering the trie value on each iteration or re-reading the
2832 // last one after the fast-track run.
2833 undecomposed_starter = composition.decomposition.attach_trie_value(c_from_back);
2834 sink.write_slice(consumed_so_far.as_slice())?;
2835 break 'fast;
2836 }
2837 // End of stream
2838 sink.write_slice(pending_slice)?;
2839 return Ok(());
2840 }
2841 // Sync the main iterator
2842 // composition.decomposition.delegate = code_unit_iter.as_slice().chars();
                // SAFETY: `ptr` and `end` have been derived from the same allocation
2844 // and `ptr` is never greater than `end`.
2845 composition.decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
2846 break 'fastwrap;
2847 }
2848 },
2849 text,
2850 sink,
2851 composition,
2852 composition_passthrough_bound,
2853 undecomposed_starter,
2854 pending_slice,
2855 len_utf16,
2856 );
2857}
2858
2859/// A normalizer for performing composing normalization.
2860#[derive(Debug)]
2861pub struct ComposingNormalizer {
2862 decomposing_normalizer: DecomposingNormalizer,
2863 canonical_compositions: DataPayload<NormalizerNfcV1>,
2864}
2865
2866impl ComposingNormalizer {
2867 /// Constructs a borrowed version of this type for more efficient querying.
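    ///
    /// # Example
    ///
    /// A minimal sketch (assumes the `compiled_data` feature):
    ///
    /// ```
    /// let owned: icu_normalizer::ComposingNormalizer =
    ///     icu_normalizer::ComposingNormalizerBorrowed::new_nfc().static_to_owned();
    /// assert!(owned.as_borrowed().is_normalized("é"));
    /// ```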
2868 pub fn as_borrowed(&self) -> ComposingNormalizerBorrowed<'_> {
2869 ComposingNormalizerBorrowed {
2870 decomposing_normalizer: self.decomposing_normalizer.as_borrowed(),
2871 canonical_compositions: self.canonical_compositions.get(),
2872 }
2873 }
2874
2875 /// NFC constructor using compiled data.
2876 ///
2877 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2878 ///
2879 /// [📚 Help choosing a constructor](icu_provider::constructors)
2880 #[cfg(feature = "compiled_data")]
2881 pub const fn new_nfc() -> ComposingNormalizerBorrowed<'static> {
2882 ComposingNormalizerBorrowed::new_nfc()
2883 }
2884
2885 icu_provider::gen_buffer_data_constructors!(
2886 () -> error: DataError,
2887 functions: [
2888 new_nfc: skip,
2889 try_new_nfc_with_buffer_provider,
2890 try_new_nfc_unstable,
2891 Self,
2892 ]
2893 );
2894
2895 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
2896 pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, DataError>
2897 where
2898 D: DataProvider<NormalizerNfdDataV1>
2899 + DataProvider<NormalizerNfdTablesV1>
2900 + DataProvider<NormalizerNfcV1>
2901 + ?Sized,
2902 {
2903 let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?;
2904
2905 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2906 provider.load(Default::default())?.payload;
2907
2908 Ok(ComposingNormalizer {
2909 decomposing_normalizer,
2910 canonical_compositions,
2911 })
2912 }
2913
2914 /// NFKC constructor using compiled data.
2915 ///
2916 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2917 ///
2918 /// [📚 Help choosing a constructor](icu_provider::constructors)
2919 #[cfg(feature = "compiled_data")]
2920 pub const fn new_nfkc() -> ComposingNormalizerBorrowed<'static> {
2921 ComposingNormalizerBorrowed::new_nfkc()
2922 }
2923
2924 icu_provider::gen_buffer_data_constructors!(
2925 () -> error: DataError,
2926 functions: [
2927 new_nfkc: skip,
2928 try_new_nfkc_with_buffer_provider,
2929 try_new_nfkc_unstable,
2930 Self,
2931 ]
2932 );
2933
2934 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
2935 pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, DataError>
2936 where
2937 D: DataProvider<NormalizerNfkdDataV1>
2938 + DataProvider<NormalizerNfdTablesV1>
2939 + DataProvider<NormalizerNfkdTablesV1>
2940 + DataProvider<NormalizerNfcV1>
2941 + ?Sized,
2942 {
2943 let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?;
2944
2945 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2946 provider.load(Default::default())?.payload;
2947
2948 Ok(ComposingNormalizer {
2949 decomposing_normalizer,
2950 canonical_compositions,
2951 })
2952 }
2953
2954 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
2955 pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, DataError>
2956 where
2957 D: DataProvider<NormalizerUts46DataV1>
2958 + DataProvider<NormalizerNfdTablesV1>
2959 + DataProvider<NormalizerNfkdTablesV1>
2960 // UTS 46 tables merged into CompatibilityDecompositionTablesV1
2961 + DataProvider<NormalizerNfcV1>
2962 + ?Sized,
2963 {
2964 let decomposing_normalizer =
2965 DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
2966
2967 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2968 provider.load(Default::default())?.payload;
2969
2970 Ok(ComposingNormalizer {
2971 decomposing_normalizer,
2972 canonical_compositions,
2973 })
2974 }
2975}
2976
2977#[cfg(feature = "utf16_iter")]
2978struct IsNormalizedSinkUtf16<'a> {
2979 expect: &'a [u16],
2980}
2981
2982#[cfg(feature = "utf16_iter")]
2983impl<'a> IsNormalizedSinkUtf16<'a> {
2984 pub fn new(slice: &'a [u16]) -> Self {
2985 IsNormalizedSinkUtf16 { expect: slice }
2986 }
2987 pub fn remaining_len(&self) -> usize {
2988 self.expect.len()
2989 }
2990}
2991
2992#[cfg(feature = "utf16_iter")]
2993impl write16::Write16 for IsNormalizedSinkUtf16<'_> {
2994 fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
2995 // We know that if we get a slice, it's a pass-through,
2996 // so we can compare addresses. Indexing is OK, because
2997 // an indexing failure would be a code bug rather than
2998 // an input or data issue.
2999 #[expect(clippy::indexing_slicing)]
3000 if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
3001 self.expect = &self.expect[s.len()..];
3002 Ok(())
3003 } else {
3004 Err(core::fmt::Error {})
3005 }
3006 }
3007
3008 fn write_char(&mut self, c: char) -> core::fmt::Result {
3009 let mut iter = self.expect.chars();
3010 if iter.next() == Some(c) {
3011 self.expect = iter.as_slice();
3012 Ok(())
3013 } else {
3014 Err(core::fmt::Error {})
3015 }
3016 }
3017}
3018
3019#[cfg(feature = "utf8_iter")]
3020struct IsNormalizedSinkUtf8<'a> {
3021 expect: &'a [u8],
3022}
3023
3024#[cfg(feature = "utf8_iter")]
3025impl<'a> IsNormalizedSinkUtf8<'a> {
3026 pub fn new(slice: &'a [u8]) -> Self {
3027 IsNormalizedSinkUtf8 { expect: slice }
3028 }
3029 pub fn remaining_len(&self) -> usize {
3030 self.expect.len()
3031 }
3032}
3033
3034#[cfg(feature = "utf8_iter")]
3035impl core::fmt::Write for IsNormalizedSinkUtf8<'_> {
3036 fn write_str(&mut self, s: &str) -> core::fmt::Result {
3037 // We know that if we get a slice, it's a pass-through,
3038 // so we can compare addresses. Indexing is OK, because
3039 // an indexing failure would be a code bug rather than
3040 // an input or data issue.
3041 #[expect(clippy::indexing_slicing)]
3042 if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
3043 self.expect = &self.expect[s.len()..];
3044 Ok(())
3045 } else {
3046 Err(core::fmt::Error {})
3047 }
3048 }
3049
3050 fn write_char(&mut self, c: char) -> core::fmt::Result {
3051 let mut iter = self.expect.chars();
3052 if iter.next() == Some(c) {
3053 self.expect = iter.as_slice();
3054 Ok(())
3055 } else {
3056 Err(core::fmt::Error {})
3057 }
3058 }
3059}
3060
3061struct IsNormalizedSinkStr<'a> {
3062 expect: &'a str,
3063}
3064
3065impl<'a> IsNormalizedSinkStr<'a> {
3066 pub fn new(slice: &'a str) -> Self {
3067 IsNormalizedSinkStr { expect: slice }
3068 }
3069 pub fn remaining_len(&self) -> usize {
3070 self.expect.len()
3071 }
3072}
3073
3074impl core::fmt::Write for IsNormalizedSinkStr<'_> {
3075 fn write_str(&mut self, s: &str) -> core::fmt::Result {
3076 // We know that if we get a slice, it's a pass-through,
3077 // so we can compare addresses. Indexing is OK, because
3078 // an indexing failure would be a code bug rather than
3079 // an input or data issue.
        #[expect(clippy::indexing_slicing)]
        if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
3081 self.expect = &self.expect[s.len()..];
3082 Ok(())
3083 } else {
3084 Err(core::fmt::Error {})
3085 }
3086 }
3087
3088 fn write_char(&mut self, c: char) -> core::fmt::Result {
3089 let mut iter = self.expect.chars();
3090 if iter.next() == Some(c) {
3091 self.expect = iter.as_str();
3092 Ok(())
3093 } else {
3094 Err(core::fmt::Error {})
3095 }
3096 }
3097}