icu_normalizer/lib.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
6#![cfg_attr(not(any(test, doc)), no_std)]
7#![cfg_attr(
8 not(test),
9 deny(
10 clippy::indexing_slicing,
11 clippy::unwrap_used,
12 clippy::expect_used,
13 clippy::panic,
14 )
15)]
16#![warn(missing_docs)]
17
18//! Normalizing text into Unicode Normalization Forms.
19//!
20//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
21//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
22//!
23//! # Functionality
24//!
25//! The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode
26//! Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD.
27//!
28//! Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8,
29//! and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator.
30//!
31//! The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA
32//! Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by
33//! applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the
34//! [`idna`](https://docs.rs/idna/latest/idna/) crate.
35//!
36//! The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and
37//! the canonical compositon operation given two `char`s. It also provides access to the Canonical Combining Class
38//! property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/), the types
39//! [`CanonicalComposition`](properties::CanonicalComposition), [`CanonicalDecomposition`](properties::CanonicalDecomposition),
40//! and [`CanonicalCombiningClassMap`](properties::CanonicalCombiningClassMap) implement the [`harfbuzz_traits`] if
41//! the `harfbuzz_traits` Cargo feature is enabled.
42//!
43//! Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in
44//! addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive
45//! non-“maybe” answer.
46//!
47//! # Examples
48//!
49//! ```
50//! let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
51//! assert_eq!(nfc.normalize("a\u{0308}"), "ä");
52//! assert!(nfc.is_normalized("ä"));
53//!
54//! let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
55//! assert_eq!(nfd.normalize("ä"), "a\u{0308}");
56//! assert!(!nfd.is_normalized("ä"));
57//! ```
58
59extern crate alloc;
60
61// TODO: The plan is to replace
62// `#[cfg(not(icu4x_unstable_fast_trie_only))]`
63// with
64// `#[cfg(feature = "serde")]`
65// and
66// `#[cfg(icu4x_unstable_fast_trie_only)]`
67// with
68// `#[cfg(not(feature = "serde"))]`
69//
70// Before doing so:
71// * The type of the UTS 46 trie needs to be
72// disentangled from the type of the NFD/NFKD tries.
73// This will involve a more generic iterator hidden
74// inside the public iterator types.
75// * datagen needs to emit fast-mode tries for the
76// NFD and NFKD tries.
77// * The markers and possibly the data struct type
78// for NFD and NFKD need to be revised per policy.
79
80#[cfg(not(icu4x_unstable_fast_trie_only))]
81type Trie<'trie> = CodePointTrie<'trie, u32>;
82
83#[cfg(icu4x_unstable_fast_trie_only)]
84type Trie<'trie> = FastCodePointTrie<'trie, u32>;
85
86// We don't depend on icu_properties to minimize deps, but we want to be able
87// to ensure we're using the right CCC values
88macro_rules! ccc {
89 ($name:ident, $num:expr) => {
90 const {
91 #[cfg(feature = "icu_properties")]
92 if icu_properties::props::CanonicalCombiningClass::$name.to_icu4c_value() != $num {
93 panic!("icu_normalizer has incorrect ccc values")
94 }
95 CanonicalCombiningClass::from_icu4c_value($num)
96 }
97 };
98}
99
100#[cfg(feature = "harfbuzz_traits")]
101mod harfbuzz;
102pub mod properties;
103pub mod provider;
104pub mod uts46;
105
106use crate::provider::CanonicalCompositions;
107use crate::provider::DecompositionData;
108use crate::provider::NormalizerNfdDataV1;
109use crate::provider::NormalizerNfkdDataV1;
110use crate::provider::NormalizerUts46DataV1;
111use alloc::borrow::Cow;
112use alloc::string::String;
113use core::char::REPLACEMENT_CHARACTER;
114use icu_collections::char16trie::Char16Trie;
115use icu_collections::char16trie::Char16TrieIterator;
116use icu_collections::char16trie::TrieResult;
117#[cfg(not(icu4x_unstable_fast_trie_only))]
118use icu_collections::codepointtrie::CodePointTrie;
119#[cfg(icu4x_unstable_fast_trie_only)]
120use icu_collections::codepointtrie::FastCodePointTrie;
121#[cfg(icu4x_unstable_fast_trie_only)]
122use icu_collections::codepointtrie::TypedCodePointTrie;
123#[cfg(feature = "icu_properties")]
124use icu_properties::props::CanonicalCombiningClass;
125use icu_provider::prelude::*;
126use provider::DecompositionTables;
127use provider::NormalizerNfcV1;
128use provider::NormalizerNfdTablesV1;
129use provider::NormalizerNfkdTablesV1;
130use smallvec::SmallVec;
131#[cfg(feature = "utf16_iter")]
132use utf16_iter::Utf16CharsEx;
133#[cfg(feature = "utf8_iter")]
134use utf8_iter::Utf8CharsEx;
135use zerovec::{zeroslice, ZeroSlice};
136
137// The optimizations in the area where `likely` is used
138// are extremely brittle. `likely` is useful in the typed-trie
139// case on the UTF-16 fast path, but in order not to disturb
140// the untyped-trie case on the UTF-16 fast path, make the
141// annotations no-ops in the untyped-trie case.
142
143// `cold_path` and `likely` come from
144// https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .
145// See https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3#commitcomment-164768806
146// for permission to relicense under Unicode-3.0.
147
148#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
149#[inline(always)]
150#[cold]
151fn cold_path() {}
152
153#[cfg(all(icu4x_unstable_fast_trie_only, feature = "utf16_iter"))]
154#[inline(always)]
155pub(crate) fn likely(b: bool) -> bool {
156 if b {
157 true
158 } else {
159 cold_path();
160 false
161 }
162}
163
164// End import from https://github.com/rust-lang/hashbrown/commit/64bd7db1d1b148594edfde112cdb6d6260e2cfc3 .
165
166/// No-op for typed trie case.
167#[cfg(all(not(icu4x_unstable_fast_trie_only), feature = "utf16_iter"))]
168#[inline(always)]
169fn likely(b: bool) -> bool {
170 b
171}
172
173// This type exists as a shim for `icu_properties` `CanonicalCombiningClass` when the crate is disabled
174// It should not be exposed to users.
175#[cfg(not(feature = "icu_properties"))]
176#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
177struct CanonicalCombiningClass(pub(crate) u8);
178
179#[cfg(not(feature = "icu_properties"))]
180impl CanonicalCombiningClass {
181 const fn from_icu4c_value(v: u8) -> Self {
182 Self(v)
183 }
184 const fn to_icu4c_value(self) -> u8 {
185 self.0
186 }
187}
188
189const CCC_NOT_REORDERED: CanonicalCombiningClass = ccc!(NotReordered, 0);
190const CCC_ABOVE: CanonicalCombiningClass = ccc!(Above, 230);
191
192/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
193#[derive(Debug, PartialEq, Eq)]
194enum IgnorableBehavior {
195 /// 0xFFFFFFFF in data is not supported.
196 Unsupported,
197 /// Ignorables are ignored.
198 Ignored,
199 /// Ignorables are treated as singleton decompositions
200 /// to the REPLACEMENT CHARACTER.
201 ReplacementCharacter,
202}
203
204/// Marker for UTS 46 ignorables.
205///
206/// See trie-value-format.md
207const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;
208
209/// Marker that the decomposition does not round trip via NFC.
210///
211/// See trie-value-format.md
212const NON_ROUND_TRIP_MARKER: u32 = 1 << 30;
213
214/// Marker that the first character of the decomposition
215/// can combine backwards.
216///
217/// See trie-value-format.md
218const BACKWARD_COMBINING_MARKER: u32 = 1 << 31;
219
220/// Mask for the bits have to be zero for this to be a BMP
221/// singleton decomposition, or value baked into the surrogate
222/// range.
223///
224/// See trie-value-format.md
225const HIGH_ZEROS_MASK: u32 = 0x3FFF0000;
226
227/// Mask for the bits have to be zero for this to be a complex
228/// decomposition.
229///
230/// See trie-value-format.md
231const LOW_ZEROS_MASK: u32 = 0xFFE0;
232
233/// Checks if a trie value carries a (non-zero) canonical
234/// combining class.
235///
236/// See trie-value-format.md
237#[inline]
238fn trie_value_has_ccc(trie_value: u32) -> bool {
239 (trie_value & 0x3FFFFE00) == 0xD800
240}
241
242/// Checks if the trie signifies a special non-starter decomposition.
243///
244/// See trie-value-format.md
245fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
246 (trie_value & 0x3FFFFF00) == 0xD900
247}
248
249/// Checks if a trie value signifies a character whose decomposition
250/// starts with a non-starter.
251///
252/// See trie-value-format.md
253fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
254 trie_value_has_ccc(trie_value)
255}
256
257/// Extracts a canonical combining class (possibly zero) from a trie value.
258///
259/// See trie-value-format.md
260fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
261 if trie_value_has_ccc(trie_value) {
262 CanonicalCombiningClass::from_icu4c_value(trie_value as u8)
263 } else {
264 CCC_NOT_REORDERED
265 }
266}
267
268/// The tail (everything after the first character) of the NFKD form U+FDFA
269/// as 16-bit units.
270static FDFA_NFKD: [u16; 17] = [
271 0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
272 0x633, 0x644, 0x645,
273];
274
275/// Marker value for U+FDFA in NFKD. (Unified with Hangul syllable marker,
276/// but they differ by `NON_ROUND_TRIP_MARKER`.)
277///
278/// See trie-value-format.md
279const FDFA_MARKER: u16 = 1;
280
281// These constants originate from page 143 of Unicode 14.0
282/// Syllable base
283const HANGUL_S_BASE: u32 = 0xAC00;
284/// Lead jamo base
285const HANGUL_L_BASE: u32 = 0x1100;
286/// Vowel jamo base
287const HANGUL_V_BASE: u32 = 0x1161;
288/// Trail jamo base (deliberately off by one to account for the absence of a trail)
289const HANGUL_T_BASE: u32 = 0x11A7;
290/// Lead jamo count
291const HANGUL_L_COUNT: u32 = 19;
292/// Vowel jamo count
293const HANGUL_V_COUNT: u32 = 21;
294/// Trail jamo count (deliberately off by one to account for the absence of a trail)
295const HANGUL_T_COUNT: u32 = 28;
296/// Vowel jamo count times trail jamo count
297const HANGUL_N_COUNT: u32 = 588;
298/// Syllable count
299const HANGUL_S_COUNT: u32 = 11172;
300
301/// One past the conjoining jamo block
302const HANGUL_JAMO_LIMIT: u32 = 0x1200;
303
304/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
305/// are enabled and return `default` if debug assertions are not enabled.
306///
307/// Use this only if the only reason why `opt` could be `None` is bogus
308/// data from the provider.
309#[inline(always)]
310fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
311 if let Some(val) = opt {
312 val
313 } else {
314 // GIGO case
315 debug_assert!(false);
316 default
317 }
318}
319
320/// Convert a `u32` _obtained from data provider data_ to `char`.
321#[inline(always)]
322fn char_from_u32(u: u32) -> char {
323 unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER)
324}
325
326/// Convert a `u16` _obtained from data provider data_ to `char`.
327#[inline(always)]
328fn char_from_u16(u: u16) -> char {
329 char_from_u32(u32::from(u))
330}
331
332const EMPTY_U16: &ZeroSlice<u16> = zeroslice![];
333
334const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![];
335
336#[inline(always)]
337fn in_inclusive_range(c: char, start: char, end: char) -> bool {
338 u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
339}
340
341#[inline(always)]
342#[cfg(feature = "utf16_iter")]
343fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
344 u.wrapping_sub(start) <= (end - start)
345}
346
347/// Performs canonical composition (including Hangul) on a pair of
348/// characters or returns `None` if these characters don't compose.
349/// Composition exclusions are taken into account.
350#[inline]
351fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
352 let v = u32::from(second).wrapping_sub(HANGUL_V_BASE);
353 if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
354 return compose_non_hangul(iter, starter, second);
355 }
356 if v < HANGUL_V_COUNT {
357 let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
358 if l < HANGUL_L_COUNT {
359 let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
360 // Safe, because the inputs are known to be in range.
361 return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
362 }
363 return None;
364 }
365 if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') {
366 let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
367 if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
368 let lvt = lv + (u32::from(second) - HANGUL_T_BASE);
369 // Safe, because the inputs are known to be in range.
370 return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
371 }
372 }
373 None
374}
375
376/// Performs (non-Hangul) canonical composition on a pair of characters
377/// or returns `None` if these characters don't compose. Composition
378/// exclusions are taken into account.
379fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
380 // To make the trie smaller, the pairs are stored second character first.
381 // Given how this method is used in ways where it's known that `second`
382 // is or isn't a starter. We could potentially split the trie into two
383 // tries depending on whether `second` is a starter.
384 match iter.next(second) {
385 TrieResult::NoMatch => None,
386 TrieResult::NoValue => match iter.next(starter) {
387 TrieResult::NoMatch => None,
388 TrieResult::FinalValue(i) => {
389 if let Some(c) = char::from_u32(i as u32) {
390 Some(c)
391 } else {
392 // GIGO case
393 debug_assert!(false);
394 None
395 }
396 }
397 TrieResult::NoValue | TrieResult::Intermediate(_) => {
398 // GIGO case
399 debug_assert!(false);
400 None
401 }
402 },
403 TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => {
404 // GIGO case
405 debug_assert!(false);
406 None
407 }
408 }
409}
410
411/// See trie-value-format.md
412#[inline(always)]
413fn starter_and_decomposes_to_self_impl(trie_val: u32) -> bool {
414 // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
415 // and this function needs to ignore that.
416 (trie_val & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0
417}
418
419/// See trie-value-format.md
420#[inline(always)]
421fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> bool {
422 (trie_val & (NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER)) == 0
423}
424
425/// Struct for holding together a character and the value
426/// looked up for it from the NFD trie in a more explicit
427/// way than an anonymous pair.
428/// Also holds a flag about the supplementary-trie provenance.
429#[derive(Debug, PartialEq, Eq)]
430struct CharacterAndTrieValue {
431 character: char,
432 /// See trie-value-format.md
433 trie_val: u32,
434}
435
436impl CharacterAndTrieValue {
437 #[inline(always)]
438 pub fn new(c: char, trie_value: u32) -> Self {
439 CharacterAndTrieValue {
440 character: c,
441 trie_val: trie_value,
442 }
443 }
444
445 #[inline(always)]
446 pub fn starter_and_decomposes_to_self(&self) -> bool {
447 starter_and_decomposes_to_self_impl(self.trie_val)
448 }
449
450 /// See trie-value-format.md
451 #[inline(always)]
452 #[cfg(feature = "utf8_iter")]
453 pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool {
454 // This intentionally leaves `NON_ROUND_TRIP_MARKER` in the value
455 // to be compared with zero. U+FFFD has that flag set despite really
456 // being being round-tripping in order to make UTF-8 errors
457 // ineligible for passthrough.
458 (self.trie_val & !BACKWARD_COMBINING_MARKER) == 0
459 }
460
461 /// See trie-value-format.md
462 #[inline(always)]
463 pub fn can_combine_backwards(&self) -> bool {
464 (self.trie_val & BACKWARD_COMBINING_MARKER) != 0
465 }
466 /// See trie-value-format.md
467 #[inline(always)]
468 pub fn potential_passthrough(&self) -> bool {
469 (self.trie_val & NON_ROUND_TRIP_MARKER) == 0
470 }
471 /// See trie-value-format.md
472 #[inline(always)]
473 pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
474 potential_passthrough_and_cannot_combine_backwards_impl(self.trie_val)
475 }
476}
477
478/// Pack a `char` and a `CanonicalCombiningClass` in
479/// 32 bits (the former in the lower 24 bits and the
480/// latter in the high 8 bits). The latter can be
481/// initialized to 0xFF upon creation, in which case
482/// it can be actually set later by calling
483/// `set_ccc_from_trie_if_not_already_set`. This is
484/// a micro optimization to avoid the Canonical
485/// Combining Class trie lookup when there is only
486/// one combining character in a sequence. This type
487/// is intentionally non-`Copy` to get compiler help
488/// in making sure that the class is set on the
489/// instance on which it is intended to be set
490/// and not on a temporary copy.
491///
492/// Note that 0xFF is won't be assigned to an actual
493/// canonical combining class per definition D104
494/// in The Unicode Standard.
495//
496// NOTE: The Pernosco debugger has special knowledge
497// of this struct. Please do not change the bit layout
498// or the crate-module-qualified name of this struct
499// without coordination.
500#[derive(Debug)]
501struct CharacterAndClass(u32);
502
503impl CharacterAndClass {
504 pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
505 CharacterAndClass(u32::from(c) | (u32::from(ccc.to_icu4c_value()) << 24))
506 }
507 pub fn new_with_placeholder(c: char) -> Self {
508 CharacterAndClass(u32::from(c) | ((0xFF) << 24))
509 }
510 pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
511 Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
512 }
513 pub fn new_starter(c: char) -> Self {
514 CharacterAndClass(u32::from(c))
515 }
516 /// This method must exist for Pernosco to apply its special rendering.
517 /// Also, this must not be dead code!
518 pub fn character(&self) -> char {
519 // Safe, because the low 24 bits came from a `char`
520 // originally.
521 unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
522 }
523 /// This method must exist for Pernosco to apply its special rendering.
524 pub fn ccc(&self) -> CanonicalCombiningClass {
525 CanonicalCombiningClass::from_icu4c_value((self.0 >> 24) as u8)
526 }
527
528 pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
529 (self.character(), self.ccc())
530 }
531 pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &Trie) {
532 if self.0 >> 24 != 0xFF {
533 return;
534 }
535 let scalar = self.0 & 0xFFFFFF;
536 self.0 =
537 ((ccc_from_trie_value(trie.get32_u32(scalar)).to_icu4c_value() as u32) << 24) | scalar;
538 }
539}
540
541// This function exists as a borrow check helper.
542#[inline(always)]
543fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &Trie) {
544 // We don't look up the canonical combining class for starters
545 // of for single combining characters between starters. When
546 // there's more than one combining character between starters,
547 // we look up the canonical combining class for each character
548 // exactly once.
549 if slice.len() < 2 {
550 return;
551 }
552 slice
553 .iter_mut()
554 .for_each(|cc| cc.set_ccc_from_trie_if_not_already_set(trie));
555 slice.sort_by_key(|cc| cc.ccc());
556}
557
558/// An iterator adaptor that turns an `Iterator` over `char` into
559/// a lazily-decomposed `char` sequence.
560#[derive(Debug)]
561pub struct Decomposition<'data, I>
562where
563 I: Iterator<Item = char>,
564{
565 delegate: I,
566 buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA
567 /// The index of the next item to be read from `buffer`.
568 /// The purpose if this index is to avoid having to move
569 /// the rest upon every read.
570 buffer_pos: usize,
571 // At the start of `next()` if not `None`, this is a pending unnormalized
572 // starter. When `Decomposition` appears alone, this is never a non-starter.
573 // However, when `Decomposition` appears inside a `Composition`, this
574 // may become a non-starter before `decomposing_next()` is called.
575 pending: Option<CharacterAndTrieValue>, // None at end of stream
576 // See trie-value-format.md
577 trie: &'data Trie<'data>,
578 scalars16: &'data ZeroSlice<u16>,
579 scalars24: &'data ZeroSlice<char>,
580 supplementary_scalars16: &'data ZeroSlice<u16>,
581 supplementary_scalars24: &'data ZeroSlice<char>,
582 /// The lowest character for which either of the following does
583 /// not hold:
584 /// 1. Decomposes to self.
585 /// 2. Decomposition starts with a non-starter
586 decomposition_passthrough_bound: u32, // never above 0xC0
587 ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
588}
589
590impl<'data, I> Decomposition<'data, I>
591where
592 I: Iterator<Item = char>,
593{
594 /// Constructs a decomposing iterator adapter from a delegate
595 /// iterator and references to the necessary data, without
596 /// supplementary data.
597 ///
598 /// Use `DecomposingNormalizer::normalize_iter()` instead unless
599 /// there's a good reason to use this constructor directly.
600 ///
601 /// Public but hidden in order to be able to use this from the
602 /// collator.
603 #[doc(hidden)] // used in collator
604 pub fn new(
605 delegate: I,
606 decompositions: &'data DecompositionData,
607 tables: &'data DecompositionTables,
608 ) -> Self {
609 Self::new_with_supplements(
610 delegate,
611 decompositions,
612 tables,
613 None,
614 0xC0,
615 IgnorableBehavior::Unsupported,
616 )
617 }
618
619 /// Constructs a decomposing iterator adapter from a delegate
620 /// iterator and references to the necessary data, including
621 /// supplementary data.
622 ///
623 /// Use `DecomposingNormalizer::normalize_iter()` instead unless
624 /// there's a good reason to use this constructor directly.
625 fn new_with_supplements(
626 delegate: I,
627 decompositions: &'data DecompositionData,
628 tables: &'data DecompositionTables,
629 supplementary_tables: Option<&'data DecompositionTables>,
630 decomposition_passthrough_bound: u8,
631 ignorable_behavior: IgnorableBehavior,
632 ) -> Self {
633 let mut ret = Decomposition::<I> {
634 delegate,
635 buffer: SmallVec::new(), // Normalized
636 buffer_pos: 0,
637 // Initialize with a placeholder starter in case
638 // the real stream starts with a non-starter.
639 pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)),
640 #[allow(clippy::useless_conversion, clippy::expect_used)] // Expectation always succeeds when untyped tries are in use
641 trie: <&Trie>::try_from(&decompositions.trie).expect("Unexpected trie type in data"),
642 scalars16: &tables.scalars16,
643 scalars24: &tables.scalars24,
644 supplementary_scalars16: if let Some(supplementary) = supplementary_tables {
645 &supplementary.scalars16
646 } else {
647 EMPTY_U16
648 },
649 supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
650 &supplementary.scalars24
651 } else {
652 EMPTY_CHAR
653 },
654 decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
655 ignorable_behavior,
656 };
657 let _ = ret.next(); // Remove the U+FFFF placeholder
658 ret
659 }
660
661 fn push_decomposition16(
662 &mut self,
663 offset: usize,
664 len: usize,
665 only_non_starters_in_trail: bool,
666 slice16: &ZeroSlice<u16>,
667 ) -> (char, usize) {
668 let (starter, tail) = slice16
669 .get_subslice(offset..offset + len)
670 .and_then(|slice| slice.split_first())
671 .map_or_else(
672 || {
673 // GIGO case
674 debug_assert!(false);
675 (REPLACEMENT_CHARACTER, EMPTY_U16)
676 },
677 |(first, trail)| (char_from_u16(first), trail),
678 );
679 if only_non_starters_in_trail {
680 // All the rest are combining
681 self.buffer.extend(
682 tail.iter()
683 .map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))),
684 );
685 (starter, 0)
686 } else {
687 let mut i = 0;
688 let mut combining_start = 0;
689 for u in tail.iter() {
690 let ch = char_from_u16(u);
691 let trie_value = self.trie.get(ch);
692 self.buffer.push(CharacterAndClass::new_with_trie_value(
693 CharacterAndTrieValue::new(ch, trie_value),
694 ));
695 i += 1;
696 // Half-width kana and iota subscript don't occur in the tails
697 // of these multicharacter decompositions.
698 if !decomposition_starts_with_non_starter(trie_value) {
699 combining_start = i;
700 }
701 }
702 (starter, combining_start)
703 }
704 }
705
706 fn push_decomposition32(
707 &mut self,
708 offset: usize,
709 len: usize,
710 only_non_starters_in_trail: bool,
711 slice32: &ZeroSlice<char>,
712 ) -> (char, usize) {
713 let (starter, tail) = slice32
714 .get_subslice(offset..offset + len)
715 .and_then(|slice| slice.split_first())
716 .unwrap_or_else(|| {
717 // GIGO case
718 debug_assert!(false);
719 (REPLACEMENT_CHARACTER, EMPTY_CHAR)
720 });
721 if only_non_starters_in_trail {
722 // All the rest are combining
723 self.buffer
724 .extend(tail.iter().map(CharacterAndClass::new_with_placeholder));
725 (starter, 0)
726 } else {
727 let mut i = 0;
728 let mut combining_start = 0;
729 for ch in tail.iter() {
730 let trie_value = self.trie.get(ch);
731 self.buffer.push(CharacterAndClass::new_with_trie_value(
732 CharacterAndTrieValue::new(ch, trie_value),
733 ));
734 i += 1;
735 // Half-width kana and iota subscript don't occur in the tails
736 // of these multicharacter decompositions.
737 if !decomposition_starts_with_non_starter(trie_value) {
738 combining_start = i;
739 }
740 }
741 (starter, combining_start)
742 }
743 }
744
745 #[inline(always)]
746 fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
747 CharacterAndTrieValue::new(c, self.trie.get(c))
748 }
749
750 fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
751 debug_assert!(self.pending.is_none());
752 loop {
753 let c = self.delegate.next()?;
754
755 // TODO(#2384): Measure if this check is actually an optimization.
756 if u32::from(c) < self.decomposition_passthrough_bound {
757 return Some(CharacterAndTrieValue::new(c, 0));
758 }
759
760 let trie_val = self.trie.get(c);
761 // TODO: Can we do something better about the cost of this branch in the
762 // non-UTS 46 case?
763 if trie_val == IGNORABLE_MARKER {
764 match self.ignorable_behavior {
765 IgnorableBehavior::Unsupported => {
766 debug_assert!(false);
767 }
768 IgnorableBehavior::ReplacementCharacter => {
769 return Some(CharacterAndTrieValue::new(
770 c,
771 u32::from(REPLACEMENT_CHARACTER) | NON_ROUND_TRIP_MARKER,
772 ));
773 }
774 IgnorableBehavior::Ignored => {
775 // Else ignore this character by reading the next one from the delegate.
776 continue;
777 }
778 }
779 }
780 return Some(CharacterAndTrieValue::new(c, trie_val));
781 }
782 }
783
784 fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
785 if let Some(pending) = self.pending.take() {
786 // Only happens as part of `Composition` and as part of
787 // the contiguous-buffer methods of `DecomposingNormalizer`.
788 // I.e. does not happen as part of standalone iterator
789 // usage of `Decomposition`.
790 Some(pending)
791 } else {
792 self.delegate_next_no_pending()
793 }
794 }
795
796 fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
797 let (starter, combining_start) = {
798 let c = c_and_trie_val.character;
799 // See trie-value-format.md
800 let decomposition = c_and_trie_val.trie_val;
801 // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
802 // and that flag needs to be ignored here.
803 if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
804 // The character is its own decomposition
805 (c, 0)
806 } else {
807 let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
808 let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
809 if !high_zeros && !low_zeros {
810 // Decomposition into two BMP characters: starter and non-starter
811 let starter = char_from_u32(decomposition & 0x7FFF);
812 let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
813 self.buffer
814 .push(CharacterAndClass::new_with_placeholder(combining));
815 (starter, 0)
816 } else if high_zeros {
817 // Do the check by looking at `c` instead of looking at a marker
818 // in `singleton` below, because if we looked at the trie value,
819 // we'd still have to check that `c` is in the Hangul syllable
820 // range in order for the subsequent interpretations as `char`
821 // to be safe.
822 // Alternatively, `FDFA_MARKER` and the Hangul marker could
823 // be unified. That would add a branch for Hangul and remove
824 // a branch from singleton decompositions. It seems more
825 // important to favor Hangul syllables than singleton
826 // decompositions.
827 // Note that it would be valid to hoist this Hangul check
828 // one or even two steps earlier in this check hierarchy.
829 // Right now, it's assumed the kind of decompositions into
830 // BMP starter and non-starter, which occur in many languages,
831 // should be checked before Hangul syllables, which are about
832 // one language specifically. Hopefully, we get some
833 // instruction-level parallelism out of the disjointness of
834 // operations on `c` and `decomposition`.
835 let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
836 if hangul_offset < HANGUL_S_COUNT {
837 debug_assert_eq!(decomposition, 1);
838 // Hangul syllable
839 // The math here comes from page 144 of Unicode 14.0
840 let l = hangul_offset / HANGUL_N_COUNT;
841 let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
842 let t = hangul_offset % HANGUL_T_COUNT;
843
844 // The unsafe blocks here are OK, because the values stay
845 // within the Hangul jamo block and, therefore, the scalar
846 // value range by construction.
847 self.buffer.push(CharacterAndClass::new_starter(unsafe {
848 core::char::from_u32_unchecked(HANGUL_V_BASE + v)
849 }));
850 let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
851 if t != 0 {
852 self.buffer.push(CharacterAndClass::new_starter(unsafe {
853 core::char::from_u32_unchecked(HANGUL_T_BASE + t)
854 }));
855 (first, 2)
856 } else {
857 (first, 1)
858 }
859 } else {
860 let singleton = decomposition as u16;
861 if singleton != FDFA_MARKER {
862 // Decomposition into one BMP character
863 let starter = char_from_u16(singleton);
864 (starter, 0)
865 } else {
866 // Special case for the NFKD form of U+FDFA.
867 self.buffer.extend(FDFA_NFKD.map(|u| {
868 // SAFETY: `FDFA_NFKD` is known not to contain
869 // surrogates.
870 CharacterAndClass::new_starter(unsafe {
871 core::char::from_u32_unchecked(u32::from(u))
872 })
873 }));
874 ('\u{0635}', 17)
875 }
876 }
877 } else {
878 debug_assert!(low_zeros);
879 // Only 12 of 14 bits used as of Unicode 16.
880 let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
881 // Only 3 of 4 bits used as of Unicode 16.
882 let len_bits = decomposition & 0b1111;
883 let only_non_starters_in_trail = (decomposition & 0b10000) != 0;
884 if offset < self.scalars16.len() {
885 self.push_decomposition16(
886 offset,
887 (len_bits + 2) as usize,
888 only_non_starters_in_trail,
889 self.scalars16,
890 )
891 } else if offset < self.scalars16.len() + self.scalars24.len() {
892 self.push_decomposition32(
893 offset - self.scalars16.len(),
894 (len_bits + 1) as usize,
895 only_non_starters_in_trail,
896 self.scalars24,
897 )
898 } else if offset
899 < self.scalars16.len()
900 + self.scalars24.len()
901 + self.supplementary_scalars16.len()
902 {
903 self.push_decomposition16(
904 offset - (self.scalars16.len() + self.scalars24.len()),
905 (len_bits + 2) as usize,
906 only_non_starters_in_trail,
907 self.supplementary_scalars16,
908 )
909 } else {
910 self.push_decomposition32(
911 offset
912 - (self.scalars16.len()
913 + self.scalars24.len()
914 + self.supplementary_scalars16.len()),
915 (len_bits + 1) as usize,
916 only_non_starters_in_trail,
917 self.supplementary_scalars24,
918 )
919 }
920 }
921 }
922 };
923 // Either we're inside `Composition` or `self.pending.is_none()`.
924
925 self.gather_and_sort_combining(combining_start);
926 starter
927 }
928
929 fn gather_and_sort_combining(&mut self, combining_start: usize) {
930 // Not a `for` loop to avoid holding a mutable reference to `self` across
931 // the loop body.
932 while let Some(ch_and_trie_val) = self.delegate_next() {
933 if !trie_value_has_ccc(ch_and_trie_val.trie_val) {
934 self.pending = Some(ch_and_trie_val);
935 break;
936 } else if !trie_value_indicates_special_non_starter_decomposition(
937 ch_and_trie_val.trie_val,
938 ) {
939 self.buffer
940 .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
941 } else {
942 // The Tibetan special cases are starters that decompose into non-starters.
943 let mapped = match ch_and_trie_val.character {
944 '\u{0340}' => {
945 // COMBINING GRAVE TONE MARK
946 CharacterAndClass::new('\u{0300}', CCC_ABOVE)
947 }
948 '\u{0341}' => {
949 // COMBINING ACUTE TONE MARK
950 CharacterAndClass::new('\u{0301}', CCC_ABOVE)
951 }
952 '\u{0343}' => {
953 // COMBINING GREEK KORONIS
954 CharacterAndClass::new('\u{0313}', CCC_ABOVE)
955 }
956 '\u{0344}' => {
957 // COMBINING GREEK DIALYTIKA TONOS
958 self.buffer
959 .push(CharacterAndClass::new('\u{0308}', CCC_ABOVE));
960 CharacterAndClass::new('\u{0301}', CCC_ABOVE)
961 }
962 '\u{0F73}' => {
963 // TIBETAN VOWEL SIGN II
964 self.buffer
965 .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
966 CharacterAndClass::new('\u{0F72}', ccc!(CCC130, 130))
967 }
968 '\u{0F75}' => {
969 // TIBETAN VOWEL SIGN UU
970 self.buffer
971 .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
972 CharacterAndClass::new('\u{0F74}', ccc!(CCC132, 132))
973 }
974 '\u{0F81}' => {
975 // TIBETAN VOWEL SIGN REVERSED II
976 self.buffer
977 .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
978 CharacterAndClass::new('\u{0F80}', ccc!(CCC130, 130))
979 }
980 '\u{FF9E}' => {
981 // HALFWIDTH KATAKANA VOICED SOUND MARK
982 CharacterAndClass::new('\u{3099}', ccc!(KanaVoicing, 8))
983 }
984 '\u{FF9F}' => {
985 // HALFWIDTH KATAKANA VOICED SOUND MARK
986 CharacterAndClass::new('\u{309A}', ccc!(KanaVoicing, 8))
987 }
988 _ => {
989 // GIGO case
990 debug_assert!(false);
991 CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
992 }
993 };
994 self.buffer.push(mapped);
995 }
996 }
997 // Slicing succeeds by construction; we've always ensured that `combining_start`
998 // is in permissible range.
999 #[expect(clippy::indexing_slicing)]
1000 sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
1001 }
1002}
1003
1004impl<I> Iterator for Decomposition<'_, I>
1005where
1006 I: Iterator<Item = char>,
1007{
1008 type Item = char;
1009
1010 fn next(&mut self) -> Option<char> {
1011 if let Some(ret) = self.buffer.get(self.buffer_pos).map(|c| c.character()) {
1012 self.buffer_pos += 1;
1013 if self.buffer_pos == self.buffer.len() {
1014 self.buffer.clear();
1015 self.buffer_pos = 0;
1016 }
1017 return Some(ret);
1018 }
1019 debug_assert_eq!(self.buffer_pos, 0);
1020 let c_and_trie_val = self.pending.take()?;
1021 Some(self.decomposing_next(c_and_trie_val))
1022 }
1023}
1024
1025/// An iterator adaptor that turns an `Iterator` over `char` into
1026/// a lazily-decomposed and then canonically composed `char` sequence.
1027#[derive(Debug)]
1028pub struct Composition<'data, I>
1029where
1030 I: Iterator<Item = char>,
1031{
1032 /// The decomposing part of the normalizer than operates before
1033 /// the canonical composition is performed on its output.
1034 decomposition: Decomposition<'data, I>,
1035 /// Non-Hangul canonical composition data.
1036 canonical_compositions: Char16Trie<'data>,
1037 /// To make `next()` yield in cases where there's a non-composing
1038 /// starter in the decomposition buffer, we put it here to let it
1039 /// wait for the next `next()` call (or a jump forward within the
1040 /// `next()` call).
1041 unprocessed_starter: Option<char>,
1042 /// The lowest character for which any one of the following does
1043 /// not hold:
1044 /// 1. Roundtrips via decomposition and recomposition.
1045 /// 2. Decomposition starts with a non-starter
1046 /// 3. Is not a backward-combining starter
1047 composition_passthrough_bound: u32,
1048}
1049
1050impl<'data, I> Composition<'data, I>
1051where
1052 I: Iterator<Item = char>,
1053{
1054 fn new(
1055 decomposition: Decomposition<'data, I>,
1056 canonical_compositions: Char16Trie<'data>,
1057 composition_passthrough_bound: u16,
1058 ) -> Self {
1059 Self {
1060 decomposition,
1061 canonical_compositions,
1062 unprocessed_starter: None,
1063 composition_passthrough_bound: u32::from(composition_passthrough_bound),
1064 }
1065 }
1066
1067 /// Performs canonical composition (including Hangul) on a pair of
1068 /// characters or returns `None` if these characters don't compose.
1069 /// Composition exclusions are taken into account.
1070 #[inline(always)]
1071 pub fn compose(&self, starter: char, second: char) -> Option<char> {
1072 compose(self.canonical_compositions.iter(), starter, second)
1073 }
1074
1075 /// Performs (non-Hangul) canonical composition on a pair of characters
1076 /// or returns `None` if these characters don't compose. Composition
1077 /// exclusions are taken into account.
1078 #[inline(always)]
1079 fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
1080 compose_non_hangul(self.canonical_compositions.iter(), starter, second)
1081 }
1082}
1083
1084impl<I> Iterator for Composition<'_, I>
1085where
1086 I: Iterator<Item = char>,
1087{
1088 type Item = char;
1089
1090 #[inline]
1091 fn next(&mut self) -> Option<char> {
1092 let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0); // The compiler can't figure out that this gets overwritten before use.
1093 if self.unprocessed_starter.is_none() {
1094 // The loop is only broken out of as goto forward
1095 #[expect(clippy::never_loop)]
1096 loop {
1097 if let Some((character, ccc)) = self
1098 .decomposition
1099 .buffer
1100 .get(self.decomposition.buffer_pos)
1101 .map(|c| c.character_and_ccc())
1102 {
1103 self.decomposition.buffer_pos += 1;
1104 if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
1105 self.decomposition.buffer.clear();
1106 self.decomposition.buffer_pos = 0;
1107 }
1108 if ccc == CCC_NOT_REORDERED {
1109 // Previous decomposition contains a starter. This must
1110 // now become the `unprocessed_starter` for it to have
1111 // a chance to compose with the upcoming characters.
1112 //
1113 // E.g. parenthesized Hangul in NFKC comes through here,
1114 // but suitable composition exclusion could exercise this
1115 // in NFC.
1116 self.unprocessed_starter = Some(character);
1117 break; // We already have a starter, so skip taking one from `pending`.
1118 }
1119 return Some(character);
1120 }
1121 debug_assert_eq!(self.decomposition.buffer_pos, 0);
1122 undecomposed_starter = self.decomposition.pending.take()?;
1123 if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
1124 || undecomposed_starter.potential_passthrough()
1125 {
1126 // TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming
1127 // character is not below `decomposition_passthrough_bound` but is
1128 // below `composition_passthrough_bound`, we read from the trie
1129 // unnecessarily.
1130 if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
1131 let cannot_combine_backwards = u32::from(upcoming.character)
1132 < self.composition_passthrough_bound
1133 || !upcoming.can_combine_backwards();
1134 self.decomposition.pending = Some(upcoming);
1135 if cannot_combine_backwards {
1136 // Fast-track succeeded!
1137 return Some(undecomposed_starter.character);
1138 }
1139 } else {
1140 // End of stream
1141 return Some(undecomposed_starter.character);
1142 }
1143 }
1144 break; // Not actually looping
1145 }
1146 }
1147 let mut starter = '\u{0}'; // The compiler can't figure out this gets overwritten before use.
1148
1149 // The point of having this boolean is to have only one call site to
1150 // `self.decomposition.decomposing_next`, which is hopefully beneficial for
1151 // code size under inlining.
1152 let mut attempt_composition = false;
1153 loop {
1154 if let Some(unprocessed) = self.unprocessed_starter.take() {
1155 debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0));
1156 debug_assert_eq!(starter, '\u{0}');
1157 starter = unprocessed;
1158 } else {
1159 debug_assert_eq!(self.decomposition.buffer_pos, 0);
1160 let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
1161 if !attempt_composition {
1162 starter = next_starter;
1163 } else if let Some(composed) = self.compose(starter, next_starter) {
1164 starter = composed;
1165 } else {
1166 // This is our yield point. We'll pick this up above in the
1167 // next call to `next()`.
1168 self.unprocessed_starter = Some(next_starter);
1169 return Some(starter);
1170 }
1171 }
1172 // We first loop by index to avoid moving the contents of `buffer`, but
1173 // if there's a discontiguous match, we'll start modifying `buffer` instead.
1174 loop {
1175 let (character, ccc) = if let Some((character, ccc)) = self
1176 .decomposition
1177 .buffer
1178 .get(self.decomposition.buffer_pos)
1179 .map(|c| c.character_and_ccc())
1180 {
1181 (character, ccc)
1182 } else {
1183 self.decomposition.buffer.clear();
1184 self.decomposition.buffer_pos = 0;
1185 break;
1186 };
1187 if let Some(composed) = self.compose(starter, character) {
1188 starter = composed;
1189 self.decomposition.buffer_pos += 1;
1190 continue;
1191 }
1192 let mut most_recent_skipped_ccc = ccc;
1193 {
1194 let _ = self
1195 .decomposition
1196 .buffer
1197 .drain(0..self.decomposition.buffer_pos);
1198 }
1199 self.decomposition.buffer_pos = 0;
1200 if most_recent_skipped_ccc == CCC_NOT_REORDERED {
1201 // We failed to compose a starter. Discontiguous match not allowed.
1202 // We leave the starter in `buffer` for `next()` to find.
1203 return Some(starter);
1204 }
1205 let mut i = 1; // We have skipped one non-starter.
1206 while let Some((character, ccc)) = self
1207 .decomposition
1208 .buffer
1209 .get(i)
1210 .map(|c| c.character_and_ccc())
1211 {
1212 if ccc == CCC_NOT_REORDERED {
1213 // Discontiguous match not allowed.
1214 return Some(starter);
1215 }
1216 debug_assert!(ccc >= most_recent_skipped_ccc);
1217 if ccc != most_recent_skipped_ccc {
1218 // Using the non-Hangul version as a micro-optimization, since
1219 // we already rejected the case where `second` is a starter
1220 // above, and conjoining jamo are starters.
1221 if let Some(composed) = self.compose_non_hangul(starter, character) {
1222 self.decomposition.buffer.remove(i);
1223 starter = composed;
1224 continue;
1225 }
1226 }
1227 most_recent_skipped_ccc = ccc;
1228 i += 1;
1229 }
1230 break;
1231 }
1232
1233 debug_assert_eq!(self.decomposition.buffer_pos, 0);
1234
1235 if !self.decomposition.buffer.is_empty() {
1236 return Some(starter);
1237 }
1238 // Now we need to check if composition with an upcoming starter is possible.
1239 if let Some(pending) = self.decomposition.pending.take() {
1240 // We know that `pending_starter` decomposes to start with a starter.
1241 // Otherwise, it would have been moved to `self.decomposition.buffer`
1242 // by `self.decomposing_next()`. We do this set lookup here in order
1243 // to get an opportunity to go back to the fast track.
1244 // Note that this check has to happen _after_ checking that `pending`
1245 // holds a character, because this flag isn't defined to be meaningful
1246 // when `pending` isn't holding a character.
1247 if u32::from(pending.character) < self.composition_passthrough_bound
1248 || !pending.can_combine_backwards()
1249 {
1250 // Won't combine backwards anyway.
1251 self.decomposition.pending = Some(pending);
1252 return Some(starter);
1253 }
1254 // Consume what we peeked.
1255 undecomposed_starter = pending;
1256 // The following line is OK, because we're about to loop back
1257 // to `self.decomposition.decomposing_next(c);`, which will
1258 // restore the between-`next()`-calls invariant of `pending`
1259 // before this function returns.
1260 attempt_composition = true;
1261 continue;
1262 }
1263 // End of input
1264 return Some(starter);
1265 }
1266 }
1267}
1268
1269macro_rules! composing_normalize_to {
1270 ($(#[$meta:meta])*,
1271 $normalize_to:ident,
1272 $write:path,
1273 $slice:ty,
1274 $prolog:block,
1275 $always_valid_utf:literal,
1276 $as_slice:ident,
1277 $fast:block,
1278 $text:ident,
1279 $sink:ident,
1280 $composition:ident,
1281 $composition_passthrough_bound:ident,
1282 $undecomposed_starter:ident,
1283 $pending_slice:ident,
1284 $len_utf:ident,
1285 ) => {
1286 $(#[$meta])*
1287 pub fn $normalize_to<W: $write + ?Sized>(
1288 &self,
1289 $text: $slice,
1290 $sink: &mut W,
1291 ) -> core::fmt::Result {
1292 $prolog
1293 let mut $composition = self.normalize_iter($text.chars());
1294 debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
1295 for cc in $composition.decomposition.buffer.drain(..) {
1296 $sink.write_char(cc.character())?;
1297 }
1298
1299 // Try to get the compiler to hoist the bound to a register.
1300 let $composition_passthrough_bound = $composition.composition_passthrough_bound;
1301 'outer: loop {
1302 debug_assert_eq!($composition.decomposition.buffer_pos, 0);
1303 let mut $undecomposed_starter =
1304 if let Some(pending) = $composition.decomposition.pending.take() {
1305 pending
1306 } else {
1307 return Ok(());
1308 };
1309 if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
1310 $undecomposed_starter.potential_passthrough()
1311 {
1312 // We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
1313 // was returned in response to an error by the iterator. Assume the
1314 // latter for correctness even though it pessimizes the former.
1315 if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
1316 let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
1317 // The `$fast` block must either:
1318 // 1. Return due to reaching EOF
1319 // 2. Leave a starter with its trie value in `$undecomposed_starter`
1320 // and, if there is still more input, leave the next character
1321 // and its trie value in `$composition.decomposition.pending`.
1322 $fast
1323 }
1324 }
1325 // Fast track above, full algorithm below
1326 let mut starter = $composition
1327 .decomposition
1328 .decomposing_next($undecomposed_starter);
1329 'bufferloop: loop {
1330 // We first loop by index to avoid moving the contents of `buffer`, but
1331 // if there's a discontiguous match, we'll start modifying `buffer` instead.
1332 loop {
1333 let (character, ccc) = if let Some((character, ccc)) = $composition
1334 .decomposition
1335 .buffer
1336 .get($composition.decomposition.buffer_pos)
1337 .map(|c| c.character_and_ccc())
1338 {
1339 (character, ccc)
1340 } else {
1341 $composition.decomposition.buffer.clear();
1342 $composition.decomposition.buffer_pos = 0;
1343 break;
1344 };
1345 if let Some(composed) = $composition.compose(starter, character) {
1346 starter = composed;
1347 $composition.decomposition.buffer_pos += 1;
1348 continue;
1349 }
1350 let mut most_recent_skipped_ccc = ccc;
1351 if most_recent_skipped_ccc == CCC_NOT_REORDERED {
1352 // We failed to compose a starter. Discontiguous match not allowed.
1353 // Write the current `starter` we've been composing, make the unmatched
1354 // starter in the buffer the new `starter` (we know it's been decomposed)
1355 // and process the rest of the buffer with that as the starter.
1356 $sink.write_char(starter)?;
1357 starter = character;
1358 $composition.decomposition.buffer_pos += 1;
1359 continue 'bufferloop;
1360 } else {
1361 {
1362 let _ = $composition
1363 .decomposition
1364 .buffer
1365 .drain(0..$composition.decomposition.buffer_pos);
1366 }
1367 $composition.decomposition.buffer_pos = 0;
1368 }
1369 let mut i = 1; // We have skipped one non-starter.
1370 while let Some((character, ccc)) = $composition
1371 .decomposition
1372 .buffer
1373 .get(i)
1374 .map(|c| c.character_and_ccc())
1375 {
1376 if ccc == CCC_NOT_REORDERED {
1377 // Discontiguous match not allowed.
1378 $sink.write_char(starter)?;
1379 for cc in $composition.decomposition.buffer.drain(..i) {
1380 $sink.write_char(cc.character())?;
1381 }
1382 starter = character;
1383 {
1384 let removed = $composition.decomposition.buffer.remove(0);
1385 debug_assert_eq!(starter, removed.character());
1386 }
1387 debug_assert_eq!($composition.decomposition.buffer_pos, 0);
1388 continue 'bufferloop;
1389 }
1390 debug_assert!(ccc >= most_recent_skipped_ccc);
1391 if ccc != most_recent_skipped_ccc {
1392 // Using the non-Hangul version as a micro-optimization, since
1393 // we already rejected the case where `second` is a starter
1394 // above, and conjoining jamo are starters.
1395 if let Some(composed) =
1396 $composition.compose_non_hangul(starter, character)
1397 {
1398 $composition.decomposition.buffer.remove(i);
1399 starter = composed;
1400 continue;
1401 }
1402 }
1403 most_recent_skipped_ccc = ccc;
1404 i += 1;
1405 }
1406 break;
1407 }
1408 debug_assert_eq!($composition.decomposition.buffer_pos, 0);
1409
1410 if !$composition.decomposition.buffer.is_empty() {
1411 $sink.write_char(starter)?;
1412 for cc in $composition.decomposition.buffer.drain(..) {
1413 $sink.write_char(cc.character())?;
1414 }
1415 // We had non-empty buffer, so can't compose with upcoming.
1416 continue 'outer;
1417 }
1418 // Now we need to check if composition with an upcoming starter is possible.
1419 if $composition.decomposition.pending.is_some() {
1420 // We know that `pending_starter` decomposes to start with a starter.
1421 // Otherwise, it would have been moved to `composition.decomposition.buffer`
1422 // by `composition.decomposing_next()`. We do this set lookup here in order
1423 // to get an opportunity to go back to the fast track.
1424 // Note that this check has to happen _after_ checking that `pending`
1425 // holds a character, because this flag isn't defined to be meaningful
1426 // when `pending` isn't holding a character.
1427 let pending = $composition.decomposition.pending.as_ref().unwrap();
1428 if u32::from(pending.character) < $composition.composition_passthrough_bound
1429 || !pending.can_combine_backwards()
1430 {
1431 // Won't combine backwards anyway.
1432 $sink.write_char(starter)?;
1433 continue 'outer;
1434 }
1435 let pending_starter = $composition.decomposition.pending.take().unwrap();
1436 let decomposed = $composition.decomposition.decomposing_next(pending_starter);
1437 if let Some(composed) = $composition.compose(starter, decomposed) {
1438 starter = composed;
1439 } else {
1440 $sink.write_char(starter)?;
1441 starter = decomposed;
1442 }
1443 continue 'bufferloop;
1444 }
1445 // End of input
1446 $sink.write_char(starter)?;
1447 return Ok(());
1448 } // 'bufferloop
1449 }
1450 }
1451 };
1452}
1453
1454macro_rules! decomposing_normalize_to {
1455 ($(#[$meta:meta])*,
1456 $normalize_to:ident,
1457 $write:path,
1458 $slice:ty,
1459 $prolog:block,
1460 $as_slice:ident,
1461 $fast:block,
1462 $text:ident,
1463 $sink:ident,
1464 $decomposition:ident,
1465 $decomposition_passthrough_bound:ident,
1466 $undecomposed_starter:ident,
1467 $pending_slice:ident,
1468 $outer:lifetime, // loop labels use lifetime tokens
1469 ) => {
1470 $(#[$meta])*
1471 pub fn $normalize_to<W: $write + ?Sized>(
1472 &self,
1473 $text: $slice,
1474 $sink: &mut W,
1475 ) -> core::fmt::Result {
1476 $prolog
1477
1478 let mut $decomposition = self.normalize_iter($text.chars());
1479 debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
1480
1481 // Try to get the compiler to hoist the bound to a register.
1482 let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
1483 $outer: loop {
1484 for cc in $decomposition.buffer.drain(..) {
1485 $sink.write_char(cc.character())?;
1486 }
1487 debug_assert_eq!($decomposition.buffer_pos, 0);
1488 let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
1489 pending
1490 } else {
1491 return Ok(());
1492 };
1493 if $undecomposed_starter.starter_and_decomposes_to_self() {
1494 // Don't bother including `undecomposed_starter` in a contiguous buffer
1495 // write: Just write it right away:
1496 $sink.write_char($undecomposed_starter.character)?;
1497
1498 let $pending_slice = $decomposition.delegate.$as_slice();
1499 $fast
1500 }
1501 let starter = $decomposition.decomposing_next($undecomposed_starter);
1502 $sink.write_char(starter)?;
1503 }
1504 }
1505 };
1506}
1507
1508macro_rules! normalizer_methods {
1509 () => {
1510 /// Normalize a string slice into a `Cow<'a, str>`.
1511 pub fn normalize<'a>(&self, text: &'a str) -> Cow<'a, str> {
1512 let (head, tail) = self.split_normalized(text);
1513 if tail.is_empty() {
1514 return Cow::Borrowed(head);
1515 }
1516 let mut ret = String::new();
1517 ret.reserve(text.len());
1518 ret.push_str(head);
1519 let _ = self.normalize_to(tail, &mut ret);
1520 Cow::Owned(ret)
1521 }
1522
1523 /// Split a string slice into maximum normalized prefix and unnormalized suffix
1524 /// such that the concatenation of the prefix and the normalization of the suffix
1525 /// is the normalization of the whole input.
1526 pub fn split_normalized<'a>(&self, text: &'a str) -> (&'a str, &'a str) {
1527 let up_to = self.is_normalized_up_to(text);
1528 text.split_at_checked(up_to).unwrap_or_else(|| {
1529 // Internal bug, not even GIGO, never supposed to happen
1530 debug_assert!(false);
1531 ("", text)
1532 })
1533 }
1534
1535 /// Return the index a string slice is normalized up to.
1536 fn is_normalized_up_to(&self, text: &str) -> usize {
1537 let mut sink = IsNormalizedSinkStr::new(text);
1538 let _ = self.normalize_to(text, &mut sink);
1539 text.len() - sink.remaining_len()
1540 }
1541
1542 /// Check whether a string slice is normalized.
1543 pub fn is_normalized(&self, text: &str) -> bool {
1544 self.is_normalized_up_to(text) == text.len()
1545 }
1546
1547 /// Normalize a slice of potentially-invalid UTF-16 into a `Cow<'a, [u16]>`.
1548 ///
1549 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
1550 /// before normalizing.
1551 ///
1552 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1553 #[cfg(feature = "utf16_iter")]
1554 pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
1555 let (head, tail) = self.split_normalized_utf16(text);
1556 if tail.is_empty() {
1557 return Cow::Borrowed(head);
1558 }
1559 let mut ret = alloc::vec::Vec::with_capacity(text.len());
1560 ret.extend_from_slice(head);
1561 let _ = self.normalize_utf16_to(tail, &mut ret);
1562 Cow::Owned(ret)
1563 }
1564
1565 /// Split a slice of potentially-invalid UTF-16 into maximum normalized (and valid)
1566 /// prefix and unnormalized suffix such that the concatenation of the prefix and the
1567 /// normalization of the suffix is the normalization of the whole input.
1568 ///
1569 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1570 #[cfg(feature = "utf16_iter")]
1571 pub fn split_normalized_utf16<'a>(&self, text: &'a [u16]) -> (&'a [u16], &'a [u16]) {
1572 let up_to = self.is_normalized_utf16_up_to(text);
1573 text.split_at_checked(up_to).unwrap_or_else(|| {
1574 // Internal bug, not even GIGO, never supposed to happen
1575 debug_assert!(false);
1576 (&[], text)
1577 })
1578 }
1579
1580 /// Return the index a slice of potentially-invalid UTF-16 is normalized up to.
1581 ///
1582 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1583 #[cfg(feature = "utf16_iter")]
1584 fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
1585 let mut sink = IsNormalizedSinkUtf16::new(text);
1586 let _ = self.normalize_utf16_to(text, &mut sink);
1587 text.len() - sink.remaining_len()
1588 }
1589
1590 /// Checks whether a slice of potentially-invalid UTF-16 is normalized.
1591 ///
1592 /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
1593 ///
1594 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
1595 #[cfg(feature = "utf16_iter")]
1596 pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
1597 self.is_normalized_utf16_up_to(text) == text.len()
1598 }
1599
1600 /// Normalize a slice of potentially-invalid UTF-8 into a `Cow<'a, str>`.
1601 ///
1602 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1603 /// according to the WHATWG Encoding Standard.
1604 ///
1605 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1606 #[cfg(feature = "utf8_iter")]
1607 pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
1608 let (head, tail) = self.split_normalized_utf8(text);
1609 if tail.is_empty() {
1610 return Cow::Borrowed(head);
1611 }
1612 let mut ret = String::new();
1613 ret.reserve(text.len());
1614 ret.push_str(head);
1615 let _ = self.normalize_utf8_to(tail, &mut ret);
1616 Cow::Owned(ret)
1617 }
1618
1619 /// Split a slice of potentially-invalid UTF-8 into maximum normalized (and valid)
1620 /// prefix and unnormalized suffix such that the concatenation of the prefix and the
1621 /// normalization of the suffix is the normalization of the whole input.
1622 ///
1623 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1624 #[cfg(feature = "utf8_iter")]
1625 pub fn split_normalized_utf8<'a>(&self, text: &'a [u8]) -> (&'a str, &'a [u8]) {
1626 let up_to = self.is_normalized_utf8_up_to(text);
1627 let (head, tail) = text.split_at_checked(up_to).unwrap_or_else(|| {
1628 // Internal bug, not even GIGO, never supposed to happen
1629 debug_assert!(false);
1630 (&[], text)
1631 });
1632 // SAFETY: The normalization check also checks for
1633 // UTF-8 well-formedness.
1634 (unsafe { core::str::from_utf8_unchecked(head) }, tail)
1635 }
1636
1637 /// Return the index a slice of potentially-invalid UTF-8 is normalized up to
1638 ///
1639 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1640 #[cfg(feature = "utf8_iter")]
1641 fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
1642 let mut sink = IsNormalizedSinkUtf8::new(text);
1643 let _ = self.normalize_utf8_to(text, &mut sink);
1644 text.len() - sink.remaining_len()
1645 }
1646
1647 /// Check if a slice of potentially-invalid UTF-8 is normalized.
1648 ///
1649 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1650 /// according to the WHATWG Encoding Standard before checking.
1651 ///
1652 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1653 #[cfg(feature = "utf8_iter")]
1654 pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
1655 self.is_normalized_utf8_up_to(text) == text.len()
1656 }
1657 };
1658}
1659
1660/// Borrowed version of a normalizer for performing decomposing normalization.
1661#[derive(Debug)]
1662pub struct DecomposingNormalizerBorrowed<'a> {
1663 decompositions: &'a DecompositionData<'a>,
1664 tables: &'a DecompositionTables<'a>,
1665 supplementary_tables: Option<&'a DecompositionTables<'a>>,
1666 decomposition_passthrough_bound: u8, // never above 0xC0
1667 composition_passthrough_bound: u16, // never above 0x0300
1668}
1669
1670impl DecomposingNormalizerBorrowed<'static> {
1671 /// Cheaply converts a [`DecomposingNormalizerBorrowed<'static>`] into a [`DecomposingNormalizer`].
1672 ///
1673 /// Note: Due to branching and indirection, using [`DecomposingNormalizer`] might inhibit some
1674 /// compile-time optimizations that are possible with [`DecomposingNormalizerBorrowed`].
1675 pub const fn static_to_owned(self) -> DecomposingNormalizer {
1676 DecomposingNormalizer {
1677 decompositions: DataPayload::from_static_ref(self.decompositions),
1678 tables: DataPayload::from_static_ref(self.tables),
1679 supplementary_tables: if let Some(s) = self.supplementary_tables {
1680 // `map` not available in const context
1681 Some(DataPayload::from_static_ref(s))
1682 } else {
1683 None
1684 },
1685 decomposition_passthrough_bound: self.decomposition_passthrough_bound,
1686 composition_passthrough_bound: self.composition_passthrough_bound,
1687 }
1688 }
1689
1690 /// NFD constructor using compiled data.
1691 ///
1692 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1693 ///
1694 /// [📚 Help choosing a constructor](icu_provider::constructors)
1695 #[cfg(feature = "compiled_data")]
1696 pub const fn new_nfd() -> Self {
1697 const _: () = assert!(
1698 provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1699 .scalars16
1700 .const_len()
1701 + provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1702 .scalars24
1703 .const_len()
1704 <= 0xFFF,
1705 "future extension"
1706 );
1707
1708 DecomposingNormalizerBorrowed {
1709 decompositions: provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
1710 tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1711 supplementary_tables: None,
1712 decomposition_passthrough_bound: 0xC0,
1713 composition_passthrough_bound: 0x0300,
1714 }
1715 }
1716
1717 /// NFKD constructor using compiled data.
1718 ///
1719 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
1720 ///
1721 /// [📚 Help choosing a constructor](icu_provider::constructors)
1722 #[cfg(feature = "compiled_data")]
1723 pub const fn new_nfkd() -> Self {
1724 const _: () = assert!(
1725 provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1726 .scalars16
1727 .const_len()
1728 + provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1729 .scalars24
1730 .const_len()
1731 + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1732 .scalars16
1733 .const_len()
1734 + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1735 .scalars24
1736 .const_len()
1737 <= 0xFFF,
1738 "future extension"
1739 );
1740
1741 const _: () = assert!(
1742 provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap <= 0x0300,
1743 "invalid"
1744 );
1745
1746 let decomposition_capped =
1747 if provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0xC0 {
1748 provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1749 } else {
1750 0xC0
1751 };
1752 let composition_capped =
1753 if provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0x0300 {
1754 provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap
1755 } else {
1756 0x0300
1757 };
1758
1759 DecomposingNormalizerBorrowed {
1760 decompositions: provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1,
1761 tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1762 supplementary_tables: Some(provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1763 decomposition_passthrough_bound: decomposition_capped as u8,
1764 composition_passthrough_bound: composition_capped,
1765 }
1766 }
1767
1768 #[cfg(feature = "compiled_data")]
1769 pub(crate) const fn new_uts46_decomposed() -> Self {
1770 const _: () = assert!(
1771 provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1772 .scalars16
1773 .const_len()
1774 + provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
1775 .scalars24
1776 .const_len()
1777 + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1778 .scalars16
1779 .const_len()
1780 + provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1
1781 .scalars24
1782 .const_len()
1783 <= 0xFFF,
1784 "future extension"
1785 );
1786
1787 const _: () = assert!(
1788 provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap <= 0x0300,
1789 "invalid"
1790 );
1791
1792 let decomposition_capped =
1793 if provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0xC0 {
1794 provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1795 } else {
1796 0xC0
1797 };
1798 let composition_capped =
1799 if provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0x0300 {
1800 provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap
1801 } else {
1802 0x0300
1803 };
1804
1805 DecomposingNormalizerBorrowed {
1806 decompositions: provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1,
1807 tables: provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
1808 supplementary_tables: Some(provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
1809 decomposition_passthrough_bound: decomposition_capped as u8,
1810 composition_passthrough_bound: composition_capped,
1811 }
1812 }
1813}
1814
1815impl<'data> DecomposingNormalizerBorrowed<'data> {
1816 /// NFD constructor using already-loaded data.
1817 ///
1818 /// This constructor is intended for use by collations.
1819 ///
1820 /// [📚 Help choosing a constructor](icu_provider::constructors)
1821 #[doc(hidden)]
1822 pub fn new_with_data(
1823 decompositions: &'data DecompositionData<'data>,
1824 tables: &'data DecompositionTables<'data>,
1825 ) -> Self {
1826 Self {
1827 decompositions,
1828 tables,
1829 supplementary_tables: None,
1830 decomposition_passthrough_bound: 0xC0,
1831 composition_passthrough_bound: 0x0300,
1832 }
1833 }
1834
1835 /// Wraps a delegate iterator into a decomposing iterator
1836 /// adapter by using the data already held by this normalizer.
1837 pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<'data, I> {
1838 Decomposition::new_with_supplements(
1839 iter,
1840 self.decompositions,
1841 self.tables,
1842 self.supplementary_tables,
1843 self.decomposition_passthrough_bound,
1844 IgnorableBehavior::Unsupported,
1845 )
1846 }
1847
1848 normalizer_methods!();
1849
1850 decomposing_normalize_to!(
1851 /// Normalize a string slice into a `Write` sink.
1852 ,
1853 normalize_to,
1854 core::fmt::Write,
1855 &str,
1856 {
1857 },
1858 as_str,
1859 {
1860 let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
1861 0xC3u8
1862 } else {
1863 decomposition_passthrough_bound.min(0x80) as u8
1864 };
1865 // The attribute belongs on an inner statement, but Rust doesn't allow it there.
1866 #[expect(clippy::unwrap_used)]
1867 'fast: loop {
1868 let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter();
1869 'fastest: loop {
1870 if let Some(&upcoming_byte) = code_unit_iter.next() {
1871 if upcoming_byte < decomposition_passthrough_byte_bound {
1872 // Fast-track succeeded!
1873 continue 'fastest;
1874 }
1875 // This deliberately isn't panic-free, since the code pattern
1876 // that was OK for the composing counterpart regressed
1877 // English and French performance if done here, too.
1878 decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
1879 break 'fastest;
1880 }
1881 // End of stream
1882 sink.write_str(pending_slice)?;
1883 return Ok(());
1884 }
1885
1886 // `unwrap()` OK, because the slice is valid UTF-8 and we know there
1887 // is an upcoming byte.
1888 let upcoming = decomposition.delegate.next().unwrap();
1889 let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
1890 if upcoming_with_trie_value.starter_and_decomposes_to_self() {
1891 continue 'fast;
1892 }
1893 let consumed_so_far_slice = &pending_slice[..pending_slice.len()
1894 - decomposition.delegate.as_str().len()
1895 - upcoming.len_utf8()];
1896 sink.write_str(consumed_so_far_slice)?;
1897
1898 // Now let's figure out if we got a starter or a non-starter.
1899 if decomposition_starts_with_non_starter(
1900 upcoming_with_trie_value.trie_val,
1901 ) {
1902 // Let this trie value to be reprocessed in case it is
1903 // one of the rare decomposing ones.
1904 decomposition.pending = Some(upcoming_with_trie_value);
1905 decomposition.gather_and_sort_combining(0);
1906 continue 'outer;
1907 }
1908 undecomposed_starter = upcoming_with_trie_value;
1909 debug_assert!(decomposition.pending.is_none());
1910 break 'fast;
1911 }
1912 },
1913 text,
1914 sink,
1915 decomposition,
1916 decomposition_passthrough_bound,
1917 undecomposed_starter,
1918 pending_slice,
1919 'outer,
1920 );
1921
1922 decomposing_normalize_to!(
1923 /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
1924 ///
1925 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
1926 /// according to the WHATWG Encoding Standard.
1927 ///
1928 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
1929 #[cfg(feature = "utf8_iter")]
1930 ,
1931 normalize_utf8_to,
1932 core::fmt::Write,
1933 &[u8],
1934 {
1935 },
1936 as_slice,
1937 {
1938 let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8;
1939 'fast: loop {
1940 let mut code_unit_iter = decomposition.delegate.as_slice().iter();
1941 'fastest: loop {
1942 if let Some(&upcoming_byte) = code_unit_iter.next() {
1943 if upcoming_byte < decomposition_passthrough_byte_bound {
1944 // Fast-track succeeded!
1945 continue 'fastest;
1946 }
1947 break 'fastest;
1948 }
1949 // End of stream
1950 sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
1951 return Ok(());
1952 }
1953 #[expect(clippy::indexing_slicing)]
1954 {decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();}
1955
1956 // `unwrap()` OK, because the slice is valid UTF-8 and we know there
1957 // is an upcoming byte.
1958 #[expect(clippy::unwrap_used)]
1959 let upcoming = decomposition.delegate.next().unwrap();
1960 let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
1961 if upcoming_with_trie_value.starter_and_decomposes_to_self_except_replacement() {
1962 // Note: The trie value of the REPLACEMENT CHARACTER is
1963 // intentionally formatted to fail the
1964 // `starter_and_decomposes_to_self` test even though it
1965 // really is a starter that decomposes to self. This
1966 // Allows moving the branch on REPLACEMENT CHARACTER
1967 // below this `continue`.
1968 continue 'fast;
1969 }
1970
1971 // TODO: Annotate as unlikely.
1972 if upcoming == REPLACEMENT_CHARACTER {
1973 // We might have an error, so fall out of the fast path.
1974
1975 // Since the U+FFFD might signify an error, we can't
1976 // assume `upcoming.len_utf8()` for the backoff length.
1977 #[expect(clippy::indexing_slicing)]
1978 let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
1979 let back = consumed_so_far.next_back();
1980 debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
1981 let consumed_so_far_slice = consumed_so_far.as_slice();
1982 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;
1983
1984 // We could call `gather_and_sort_combining` here and
1985 // `continue 'outer`, but this should be better for code
1986 // size.
1987 undecomposed_starter = upcoming_with_trie_value;
1988 debug_assert!(decomposition.pending.is_none());
1989 break 'fast;
1990 }
1991
1992 #[expect(clippy::indexing_slicing)]
1993 let consumed_so_far_slice = &pending_slice[..pending_slice.len()
1994 - decomposition.delegate.as_slice().len()
1995 - upcoming.len_utf8()];
1996 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;
1997
1998 // Now let's figure out if we got a starter or a non-starter.
1999 if decomposition_starts_with_non_starter(
2000 upcoming_with_trie_value.trie_val,
2001 ) {
2002 // Let this trie value to be reprocessed in case it is
2003 // one of the rare decomposing ones.
2004 decomposition.pending = Some(upcoming_with_trie_value);
2005 decomposition.gather_and_sort_combining(0);
2006 continue 'outer;
2007 }
2008 undecomposed_starter = upcoming_with_trie_value;
2009 debug_assert!(decomposition.pending.is_none());
2010 break 'fast;
2011 }
2012 },
2013 text,
2014 sink,
2015 decomposition,
2016 decomposition_passthrough_bound,
2017 undecomposed_starter,
2018 pending_slice,
2019 'outer,
2020 );
2021
2022 decomposing_normalize_to!(
2023 /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
2024 ///
2025 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
2026 /// before normalizing.
2027 ///
2028 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
2029 #[cfg(feature = "utf16_iter")]
2030 ,
2031 normalize_utf16_to,
2032 write16::Write16,
2033 &[u16],
2034 {
2035 sink.size_hint(text.len())?;
2036 },
2037 as_slice,
2038 {
2039 // This loop is only broken out of as goto forward and only as release-build recovery from
2040 // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
2041 #[expect(clippy::never_loop)]
2042 'fastwrap: loop {
2043 // Commented out `code_unit_iter` and used `ptr` and `end` to
2044 // work around https://github.com/rust-lang/rust/issues/144684 .
2045 //
2046 // let mut code_unit_iter = decomposition.delegate.as_slice().iter();
2047 let delegate_as_slice = decomposition.delegate.as_slice();
2048 let mut ptr: *const u16 = delegate_as_slice.as_ptr();
2049 // SAFETY: materializing a pointer immediately past the end of an
2050 // allocation is OK.
2051 let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
2052 'fast: loop {
2053 // if let Some(&upcoming_code_unit) = code_unit_iter.next() {
2054 if ptr != end {
2055 // SAFETY: We just checked that `ptr` has not reached `end`.
2056 // `ptr` always advances by one, and we always have a check
2057 // per advancement.
2058 let upcoming_code_unit = unsafe { *ptr };
2059 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2060 // by one points to the same allocation or to immediately
2061 // after, which is OK.
2062 ptr = unsafe { ptr.add(1) };
2063
2064 let mut upcoming32 = u32::from(upcoming_code_unit);
2065 // The performance of what logically is supposed to be this
2066 // branch is _incredibly_ brittle and what LLVM ends up doing
2067 // that affects the performance of what's logically about this
2068 // decision can swing to double/halve the throughput for Basic
2069 // Latin in ways that are completely unintuitive. Basically _any_
2070 // change to _any_ code that participates in how LLVM sees the
2071 // code around here can make the perf fall over. In seems that
2072 // manually annotating this branch as likely has worse effects
2073 // on non-Basic-Latin input that the case where LLVM just happens to
2074 // do the right thing.
2075 //
2076 // What happens with this branch may depend on what sink type
2077 // this code is monomorphized over.
2078 //
2079 // What a terrible sink of developer time!
2080 if upcoming32 < decomposition_passthrough_bound {
2081 continue 'fast;
2082 }
2083 // We might be doing a trie lookup by surrogate. Surrogates get
2084 // a decomposition to U+FFFD.
2085 let mut trie_value = decomposition.trie.get16(upcoming_code_unit);
2086 if starter_and_decomposes_to_self_impl(trie_value) {
2087 continue 'fast;
2088 }
2089 // We might now be looking at a surrogate.
2090 // The loop is only broken out of as goto forward
2091 #[expect(clippy::never_loop)]
2092 'surrogateloop: loop {
2093 // LLVM's optimizations are incredibly brittle for the code _above_,
2094 // and using `likely` _below_ without using it _above_ helps!
2095 // What a massive sink of developer time!
2096 // Seriously, the effect of these annotations is massively
2097 // unintuitive. Measure everything!
2098 // Notably, the `if likely(...)` formulation optimizes differently
2099 // than just putting `cold_path()` on the `else` path!
2100 let surrogate_base = upcoming32.wrapping_sub(0xD800);
2101 if likely(surrogate_base > (0xDFFF - 0xD800)) {
2102 // Not surrogate
2103 break 'surrogateloop;
2104 }
2105 if likely(surrogate_base <= (0xDBFF - 0xD800)) {
2106 // let iter_backup = code_unit_iter.clone();
2107 // if let Some(&low) = code_unit_iter.next() {
2108 if ptr != end {
2109 // SAFETY: We just checked that `ptr` has not reached `end`.
2110 // `ptr` always advances by one, and we always have a check
2111 // per advancement.
2112 let low = unsafe { *ptr };
2113 if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
2114 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2115 // by one points to the same allocation or to immediately
2116 // after, which is OK.
2117 ptr = unsafe { ptr.add(1) };
2118
2119 upcoming32 = (upcoming32 << 10) + u32::from(low)
2120 - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
2121 // Successfully-paired surrogate. Read from the trie again.
2122 trie_value = {
2123 // Semantically, this bit of conditional compilation makes no sense.
2124 // The purpose is to keep LLVM seeing the untyped trie case the way
2125 // it did before so as not to regress the performance of the untyped
2126 // case due to unintuitive optimizer effects. If you care about the
2127 // perf of the untyped trie case and have better ideas, please try
2128 // something better.
2129 #[cfg(not(icu4x_unstable_fast_trie_only))]
2130 {decomposition.trie.get32(upcoming32)}
2131 #[cfg(icu4x_unstable_fast_trie_only)]
2132 {decomposition.trie.get32_supplementary(upcoming32)}
2133 };
2134 if likely(starter_and_decomposes_to_self_impl(trie_value)) {
2135 continue 'fast;
2136 }
2137 break 'surrogateloop;
2138 // } else {
2139 // code_unit_iter = iter_backup;
2140 }
2141 }
2142 }
2143 // unpaired surrogate
2144 upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
2145 // trie_value already holds a decomposition to U+FFFD.
2146 break 'surrogateloop;
2147 }
2148
2149 let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
2150 let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
2151
2152
2153 let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
2154 // code_unit_iter.as_slice().len()
2155 // SAFETY: `ptr` and `end` have been derived from the same allocation
2156 // and `ptr` is never greater than `end`.
2157 unsafe { end.offset_from(ptr) as usize }
2158 - upcoming.len_utf16()) else {
2159 // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2160 debug_assert!(false);
2161 // Throw away the results of the fast path.
2162 break 'fastwrap;
2163 };
2164 sink.write_slice(consumed_so_far_slice)?;
2165
2166 if decomposition_starts_with_non_starter(
2167 upcoming_with_trie_value.trie_val,
2168 ) {
2169 // Sync with main iterator
2170 // decomposition.delegate = code_unit_iter.as_slice().chars();
2171 // SAFETY: `ptr` and `end` have been derived from the same allocation
2172 // and `ptr` is never greater than `end`.
2173 decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
2174 // Let this trie value to be reprocessed in case it is
2175 // one of the rare decomposing ones.
2176 decomposition.pending = Some(upcoming_with_trie_value);
2177 decomposition.gather_and_sort_combining(0);
2178 continue 'outer;
2179 }
2180 undecomposed_starter = upcoming_with_trie_value;
2181 debug_assert!(decomposition.pending.is_none());
2182 break 'fast;
2183 }
2184 // End of stream
2185 sink.write_slice(pending_slice)?;
2186 return Ok(());
2187 }
2188 // Sync the main iterator
2189 // decomposition.delegate = code_unit_iter.as_slice().chars();
2190 // SAFETY: `ptr` and `end` have been derived from the same allocation
2191 // and `ptr` is never greater than `end`.
2192 decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
2193 break 'fastwrap;
2194 }
2195 },
2196 text,
2197 sink,
2198 decomposition,
2199 decomposition_passthrough_bound,
2200 undecomposed_starter,
2201 pending_slice,
2202 'outer,
2203 );
2204}
2205
2206/// A normalizer for performing decomposing normalization.
2207#[derive(Debug)]
2208pub struct DecomposingNormalizer {
2209 decompositions: DataPayload<NormalizerNfdDataV1>,
2210 tables: DataPayload<NormalizerNfdTablesV1>,
2211 supplementary_tables: Option<DataPayload<NormalizerNfkdTablesV1>>,
2212 decomposition_passthrough_bound: u8, // never above 0xC0
2213 composition_passthrough_bound: u16, // never above 0x0300
2214}
2215
2216impl DecomposingNormalizer {
2217 /// Constructs a borrowed version of this type for more efficient querying.
2218 pub fn as_borrowed(&self) -> DecomposingNormalizerBorrowed<'_> {
2219 DecomposingNormalizerBorrowed {
2220 decompositions: self.decompositions.get(),
2221 tables: self.tables.get(),
2222 supplementary_tables: self.supplementary_tables.as_ref().map(|s| s.get()),
2223 decomposition_passthrough_bound: self.decomposition_passthrough_bound,
2224 composition_passthrough_bound: self.composition_passthrough_bound,
2225 }
2226 }
2227
2228 /// NFD constructor using compiled data.
2229 ///
2230 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2231 ///
2232 /// [📚 Help choosing a constructor](icu_provider::constructors)
2233 #[cfg(feature = "compiled_data")]
2234 pub const fn new_nfd() -> DecomposingNormalizerBorrowed<'static> {
2235 DecomposingNormalizerBorrowed::new_nfd()
2236 }
2237
2238 icu_provider::gen_buffer_data_constructors!(
2239 () -> error: DataError,
2240 functions: [
2241 new_nfd: skip,
2242 try_new_nfd_with_buffer_provider,
2243 try_new_nfd_unstable,
2244 Self,
2245 ]
2246 );
2247
2248 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
2249 pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, DataError>
2250 where
2251 D: DataProvider<NormalizerNfdDataV1> + DataProvider<NormalizerNfdTablesV1> + ?Sized,
2252 {
2253 let decompositions: DataPayload<NormalizerNfdDataV1> =
2254 provider.load(Default::default())?.payload;
2255 let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2256
2257 if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
2258 // The data is from a future where there exists a normalization flavor whose
2259 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2260 // of space. If a good use case from such a decomposition flavor arises, we can
2261 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2262 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2263 // since for now the masks are hard-coded, error out.
2264 return Err(
2265 DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2266 );
2267 }
2268
2269 let cap = decompositions.get().passthrough_cap;
2270 if cap > 0x0300 {
2271 return Err(DataError::custom("invalid").with_marker(NormalizerNfdDataV1::INFO));
2272 }
2273 let decomposition_capped = cap.min(0xC0);
2274 let composition_capped = cap.min(0x0300);
2275
2276 Ok(DecomposingNormalizer {
2277 decompositions,
2278 tables,
2279 supplementary_tables: None,
2280 decomposition_passthrough_bound: decomposition_capped as u8,
2281 composition_passthrough_bound: composition_capped,
2282 })
2283 }
2284
2285 icu_provider::gen_buffer_data_constructors!(
2286 () -> error: DataError,
2287 functions: [
2288 new_nfkd: skip,
2289 try_new_nfkd_with_buffer_provider,
2290 try_new_nfkd_unstable,
2291 Self,
2292 ]
2293 );
2294
2295 /// NFKD constructor using compiled data.
2296 ///
2297 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2298 ///
2299 /// [📚 Help choosing a constructor](icu_provider::constructors)
2300 #[cfg(feature = "compiled_data")]
2301 pub const fn new_nfkd() -> DecomposingNormalizerBorrowed<'static> {
2302 DecomposingNormalizerBorrowed::new_nfkd()
2303 }
2304
2305 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
2306 pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, DataError>
2307 where
2308 D: DataProvider<NormalizerNfkdDataV1>
2309 + DataProvider<NormalizerNfdTablesV1>
2310 + DataProvider<NormalizerNfkdTablesV1>
2311 + ?Sized,
2312 {
2313 let decompositions: DataPayload<NormalizerNfkdDataV1> =
2314 provider.load(Default::default())?.payload;
2315 let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2316 let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2317 provider.load(Default::default())?.payload;
2318
2319 if tables.get().scalars16.len()
2320 + tables.get().scalars24.len()
2321 + supplementary_tables.get().scalars16.len()
2322 + supplementary_tables.get().scalars24.len()
2323 > 0xFFF
2324 {
2325 // The data is from a future where there exists a normalization flavor whose
2326 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2327 // of space. If a good use case from such a decomposition flavor arises, we can
2328 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2329 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2330 // since for now the masks are hard-coded, error out.
2331 return Err(
2332 DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2333 );
2334 }
2335
2336 let cap = decompositions.get().passthrough_cap;
2337 if cap > 0x0300 {
2338 return Err(DataError::custom("invalid").with_marker(NormalizerNfkdDataV1::INFO));
2339 }
2340 let decomposition_capped = cap.min(0xC0);
2341 let composition_capped = cap.min(0x0300);
2342
2343 Ok(DecomposingNormalizer {
2344 decompositions: decompositions.cast(),
2345 tables,
2346 supplementary_tables: Some(supplementary_tables),
2347 decomposition_passthrough_bound: decomposition_capped as u8,
2348 composition_passthrough_bound: composition_capped,
2349 })
2350 }
2351
2352 /// UTS 46 decomposed constructor (testing only)
2353 ///
2354 /// This is a special building block normalization for IDNA. It is the decomposed counterpart of
2355 /// ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows and
2356 /// ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as in
2357 /// NFD in this normalization. In both cases, the previous UTS 46 processing before using
2358 /// normalization is expected to deal with these characters. Making the disallowed characters
2359 /// behave like this is beneficial to data size, and this normalizer implementation cannot
2360 /// deal with a character normalizing to the empty string, which doesn't happen in NFD or
2361 /// NFKD as of Unicode 14.
2362 ///
2363 /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2364 /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
2365 /// U+0345 from a reordered character into a non-reordered character before reordering happens.
2366 /// Therefore, the output of this normalization may differ for different inputs that are
2367 /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
2368 /// to other reorderable characters.
2369 pub(crate) fn try_new_uts46_decomposed_unstable<D>(provider: &D) -> Result<Self, DataError>
2370 where
2371 D: DataProvider<NormalizerUts46DataV1>
2372 + DataProvider<NormalizerNfdTablesV1>
2373 + DataProvider<NormalizerNfkdTablesV1>
2374 // UTS 46 tables merged into CompatibilityDecompositionTablesV1
2375 + ?Sized,
2376 {
2377 let decompositions: DataPayload<NormalizerUts46DataV1> =
2378 provider.load(Default::default())?.payload;
2379 let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
2380 let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
2381 provider.load(Default::default())?.payload;
2382
2383 if tables.get().scalars16.len()
2384 + tables.get().scalars24.len()
2385 + supplementary_tables.get().scalars16.len()
2386 + supplementary_tables.get().scalars24.len()
2387 > 0xFFF
2388 {
2389 // The data is from a future where there exists a normalization flavor whose
2390 // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
2391 // of space. If a good use case from such a decomposition flavor arises, we can
2392 // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
2393 // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
2394 // since for now the masks are hard-coded, error out.
2395 return Err(
2396 DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
2397 );
2398 }
2399
2400 let cap = decompositions.get().passthrough_cap;
2401 if cap > 0x0300 {
2402 return Err(DataError::custom("invalid").with_marker(NormalizerUts46DataV1::INFO));
2403 }
2404 let decomposition_capped = cap.min(0xC0);
2405 let composition_capped = cap.min(0x0300);
2406
2407 Ok(DecomposingNormalizer {
2408 decompositions: decompositions.cast(),
2409 tables,
2410 supplementary_tables: Some(supplementary_tables),
2411 decomposition_passthrough_bound: decomposition_capped as u8,
2412 composition_passthrough_bound: composition_capped,
2413 })
2414 }
2415}
2416
2417/// Borrowed version of a normalizer for performing composing normalization.
2418#[derive(Debug)]
2419pub struct ComposingNormalizerBorrowed<'a> {
2420 decomposing_normalizer: DecomposingNormalizerBorrowed<'a>,
2421 canonical_compositions: &'a CanonicalCompositions<'a>,
2422}
2423
2424impl ComposingNormalizerBorrowed<'static> {
2425 /// Cheaply converts a [`ComposingNormalizerBorrowed<'static>`] into a [`ComposingNormalizer`].
2426 ///
2427 /// Note: Due to branching and indirection, using [`ComposingNormalizer`] might inhibit some
2428 /// compile-time optimizations that are possible with [`ComposingNormalizerBorrowed`].
2429 pub const fn static_to_owned(self) -> ComposingNormalizer {
2430 ComposingNormalizer {
2431 decomposing_normalizer: self.decomposing_normalizer.static_to_owned(),
2432 canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions),
2433 }
2434 }
2435
2436 /// NFC constructor using compiled data.
2437 ///
2438 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2439 ///
2440 /// [📚 Help choosing a constructor](icu_provider::constructors)
2441 #[cfg(feature = "compiled_data")]
2442 pub const fn new_nfc() -> Self {
2443 ComposingNormalizerBorrowed {
2444 decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfd(),
2445 canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2446 }
2447 }
2448
2449 /// NFKC constructor using compiled data.
2450 ///
2451 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2452 ///
2453 /// [📚 Help choosing a constructor](icu_provider::constructors)
2454 #[cfg(feature = "compiled_data")]
2455 pub const fn new_nfkc() -> Self {
2456 ComposingNormalizerBorrowed {
2457 decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfkd(),
2458 canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2459 }
2460 }
2461
2462 /// This is a special building block normalization for IDNA that implements parts of the Map
2463 /// step and the following Normalize step.
2464 ///
2465 /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2466 /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
2467 /// U+0345 from a reordered character into a non-reordered character before reordering happens.
2468 /// Therefore, the output of this normalization may differ for different inputs that are
2469 /// canonically equivalents with each other if they differ by how U+0345 is ordered relative
2470 /// to other reorderable characters.
2471 #[cfg(feature = "compiled_data")]
2472 pub(crate) const fn new_uts46() -> Self {
2473 ComposingNormalizerBorrowed {
2474 decomposing_normalizer: DecomposingNormalizerBorrowed::new_uts46_decomposed(),
2475 canonical_compositions: provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
2476 }
2477 }
2478}
2479
2480impl<'data> ComposingNormalizerBorrowed<'data> {
2481 /// Wraps a delegate iterator into a composing iterator
2482 /// adapter by using the data already held by this normalizer.
2483 pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<'data, I> {
2484 self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
2485 }
2486
2487 fn normalize_iter_private<I: Iterator<Item = char>>(
2488 &self,
2489 iter: I,
2490 ignorable_behavior: IgnorableBehavior,
2491 ) -> Composition<'data, I> {
2492 Composition::new(
2493 Decomposition::new_with_supplements(
2494 iter,
2495 self.decomposing_normalizer.decompositions,
2496 self.decomposing_normalizer.tables,
2497 self.decomposing_normalizer.supplementary_tables,
2498 self.decomposing_normalizer.decomposition_passthrough_bound,
2499 ignorable_behavior,
2500 ),
2501 self.canonical_compositions.canonical_compositions.clone(),
2502 self.decomposing_normalizer.composition_passthrough_bound,
2503 )
2504 }
2505
2506 normalizer_methods!();
2507
2508 composing_normalize_to!(
2509 /// Normalize a string slice into a `Write` sink.
2510 ,
2511 normalize_to,
2512 core::fmt::Write,
2513 &str,
2514 {},
2515 true,
2516 as_str,
2517 {
2518 // Let's hope LICM hoists this outside `'outer`.
2519 let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
2520 0xCCu8
2521 } else {
2522 // We can make this fancy if a normalization other than NFC where looking at
2523 // non-ASCII lead bytes is worthwhile is ever introduced.
2524 composition_passthrough_bound.min(0x80) as u8
2525 };
2526 // Attributes have to be on blocks, so hoisting all the way here.
2527 #[expect(clippy::unwrap_used)]
2528 'fast: loop {
2529 let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter();
2530 'fastest: loop {
2531 if let Some(&upcoming_byte) = code_unit_iter.next() {
2532 if upcoming_byte < composition_passthrough_byte_bound {
2533 // Fast-track succeeded!
2534 continue 'fastest;
2535 }
2536 let Some(remaining_slice) = pending_slice.get(pending_slice.len() - code_unit_iter.as_slice().len() - 1..) else {
2537 // If we ever come here, it's an internal bug. Let's avoid panic code paths in release builds.
2538 debug_assert!(false);
2539 // Throw away the fastest-path result in case of an internal bug.
2540 break 'fastest;
2541 };
2542 composition.decomposition.delegate = remaining_slice.chars();
2543 break 'fastest;
2544 }
2545 // End of stream
2546 sink.write_str(pending_slice)?;
2547 return Ok(());
2548 }
2549 // `unwrap()` OK, because the slice is valid UTF-8 and we know there
2550 // is an upcoming byte.
2551 let upcoming = composition.decomposition.delegate.next().unwrap();
2552 let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2553 if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2554 // Can't combine backwards, hence a plain (non-backwards-combining)
2555 // starter albeit past `composition_passthrough_bound`
2556
2557 // Fast-track succeeded!
2558 continue 'fast;
2559 }
2560 // We need to fall off the fast path.
2561 composition.decomposition.pending = Some(upcoming_with_trie_value);
2562
2563 // slicing and unwrap OK, because we've just evidently read enough previously.
2564 let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
2565 // `unwrap` OK, because we've previously manage to read the previous character
2566 undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2567 let consumed_so_far_slice = consumed_so_far.as_str();
2568 sink.write_str(consumed_so_far_slice)?;
2569 break 'fast;
2570 }
2571 },
2572 text,
2573 sink,
2574 composition,
2575 composition_passthrough_bound,
2576 undecomposed_starter,
2577 pending_slice,
2578 len_utf8,
2579 );
2580
2581 composing_normalize_to!(
2582 /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
2583 ///
2584 /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
2585 /// according to the WHATWG Encoding Standard.
2586 ///
2587 /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
2588 #[cfg(feature = "utf8_iter")]
2589 ,
2590 normalize_utf8_to,
2591 core::fmt::Write,
2592 &[u8],
2593 {},
2594 false,
2595 as_slice,
2596 {
2597 'fast: loop {
2598 if let Some(upcoming) = composition.decomposition.delegate.next() {
2599 if u32::from(upcoming) < composition_passthrough_bound {
2600 // Fast-track succeeded!
2601 continue 'fast;
2602 }
2603 // TODO: Be statically aware of fast/small trie.
2604 let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
2605 if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
2606 // Note: The trie value of the REPLACEMENT CHARACTER is
2607 // intentionally formatted to fail the
2608 // `potential_passthrough_and_cannot_combine_backwards`
2609 // test even though it really is a starter that decomposes
2610 // to self and cannot combine backwards. This
2611 // Allows moving the branch on REPLACEMENT CHARACTER
2612 // below this `continue`.
2613 continue 'fast;
2614 }
2615 // We need to fall off the fast path.
2616
2617 // TODO(#2006): Annotate as unlikely
2618 if upcoming == REPLACEMENT_CHARACTER {
2619 // Can't tell if this is an error or a literal U+FFFD in
2620 // the input. Assuming the former to be sure.
2621
2622 // Since the U+FFFD might signify an error, we can't
2623 // assume `upcoming.len_utf8()` for the backoff length.
2624 #[expect(clippy::indexing_slicing)]
2625 let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
2626 let back = consumed_so_far.next_back();
2627 debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
2628 let consumed_so_far_slice = consumed_so_far.as_slice();
2629 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) })?;
2630 undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
2631 composition.decomposition.pending = None;
2632 break 'fast;
2633 }
2634
2635 composition.decomposition.pending = Some(upcoming_with_trie_value);
2636 // slicing and unwrap OK, because we've just evidently read enough previously.
2637 // `unwrap` OK, because we've previously manage to read the previous character
2638 #[expect(clippy::indexing_slicing)]
2639 let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
2640 #[expect(clippy::unwrap_used)]
2641 {
2642 // TODO: If the previous character was below the passthrough bound,
2643 // we really need to read from the trie. Otherwise, we could maintain
2644 // the most-recent trie value. Need to measure what's more expensive:
2645 // Remembering the trie value on each iteration or re-reading the
2646 // last one after the fast-track run.
2647 undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
2648 }
2649 let consumed_so_far_slice = consumed_so_far.as_slice();
2650 sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice)})?;
2651 break 'fast;
2652 }
2653 // End of stream
2654 sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
2655 return Ok(());
2656 }
2657 },
2658 text,
2659 sink,
2660 composition,
2661 composition_passthrough_bound,
2662 undecomposed_starter,
2663 pending_slice,
2664 len_utf8,
2665 );
2666
2667 composing_normalize_to!(
2668 /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
2669 ///
2670 /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
2671 /// before normalizing.
2672 ///
2673 /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
2674 #[cfg(feature = "utf16_iter")]
2675 ,
2676 normalize_utf16_to,
2677 write16::Write16,
2678 &[u16],
2679 {
2680 sink.size_hint(text.len())?;
2681 },
2682 false,
2683 as_slice,
2684 {
2685 // This loop is only broken out of as goto forward and only as release-build recovery from
2686 // detecting an internal bug without panic. (In debug builds, internal bugs panic instead.)
2687 #[expect(clippy::never_loop)]
2688 'fastwrap: loop {
2689 // Commented out `code_unit_iter` and used `ptr` and `end` to
2690 // work around https://github.com/rust-lang/rust/issues/144684 .
2691 //
2692 // let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
2693 let delegate_as_slice = composition.decomposition.delegate.as_slice();
2694 let mut ptr: *const u16 = delegate_as_slice.as_ptr();
2695 // SAFETY: materializing a pointer immediately past the end of an
2696 // allocation is OK.
2697 let end: *const u16 = unsafe { ptr.add(delegate_as_slice.len()) };
2698
2699 'fast: loop {
2700 // if let Some(&upcoming_code_unit) = code_unit_iter.next() {
2701 if ptr != end {
2702 // SAFETY: We just checked that `ptr` has not reached `end`.
2703 // `ptr` always advances by one, and we always have a check
2704 // per advancement.
2705 let upcoming_code_unit = unsafe { *ptr };
2706 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2707 // by one points to the same allocation or to immediately
2708 // after, which is OK.
2709 ptr = unsafe { ptr.add(1) };
2710
2711 let mut upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
2712 // The performance of what logically is supposed to be this
2713 // branch is somewhat brittle and what LLVM ends up doing
2714 // that affects the performance of what's logically about this
2715 // decision can swing to double/halve the throughput for Basic
2716 // Latin in ways that are completely unintuitive. Basically _any_
2717 // change to _any_ code that participates in how LLVM sees the
2718 // code around here can make the perf fall over. In seems that
2719 // manually annotating this branch as likely has worse effects
2720 // on non-Basic-Latin input that the case where LLVM just happens to
2721 // do the right thing.
2722 //
2723 // What happens with this branch may depend on what sink type
2724 // this code is monomorphized over.
2725 //
2726 // What a terrible sink of developer time!
2727 if upcoming32 < composition_passthrough_bound {
2728 // No need for surrogate or U+FFFD check, because
2729 // `composition_passthrough_bound` cannot be higher than
2730 // U+0300.
2731 // Fast-track succeeded!
2732 continue 'fast;
2733 }
2734 // We might be doing a trie lookup by surrogate. Surrogates get
2735 // a decomposition to U+FFFD.
2736 let mut trie_value = composition.decomposition.trie.get16(upcoming_code_unit);
2737 if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) {
2738 // Can't combine backwards, hence a plain (non-backwards-combining)
2739 // starter albeit past `composition_passthrough_bound`
2740
2741 // Fast-track succeeded!
2742 continue 'fast;
2743 }
2744
2745 // We might now be looking at a surrogate.
2746 // The loop is only broken out of as goto forward
2747 #[expect(clippy::never_loop)]
2748 'surrogateloop: loop {
2749 // The `likely` annotations _below_ exist to make the code _above_
2750 // go faster!
2751 let surrogate_base = upcoming32.wrapping_sub(0xD800);
2752 if likely(surrogate_base > (0xDFFF - 0xD800)) {
2753 // Not surrogate
2754 break 'surrogateloop;
2755 }
2756 if likely(surrogate_base <= (0xDBFF - 0xD800)) {
2757 // let iter_backup = code_unit_iter.clone();
2758 // if let Some(&low) = code_unit_iter.next() {
2759 if ptr != end {
2760 // SAFETY: We just checked that `ptr` has not reached `end`.
2761 // `ptr` always advances by one, and we always have a check
2762 // per advancement.
2763 let low = unsafe { *ptr };
2764 if likely(in_inclusive_range16(low, 0xDC00, 0xDFFF)) {
2765 // SAFETY: Since `ptr` hadn't reached `end`, yet, advancing
2766 // by one points to the same allocation or to immediately
2767 // after, which is OK.
2768 ptr = unsafe { ptr.add(1) };
2769
2770 upcoming32 = (upcoming32 << 10) + u32::from(low)
2771 - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
2772 // Successfully-paired surrogate. Read from the trie again.
2773 trie_value = {
2774 // Semantically, this bit of conditional compilation makes no sense.
2775 // The purpose is to keep LLVM seeing the untyped trie case the way
2776 // it did before so as not to regress the performance of the untyped
2777 // case due to unintuitive optimizer effects. If you care about the
2778 // perf of the untyped trie case and have better ideas, please try
2779 // something better.
2780 #[cfg(not(icu4x_unstable_fast_trie_only))]
2781 {composition.decomposition.trie.get32(upcoming32)}
2782 #[cfg(icu4x_unstable_fast_trie_only)]
2783 {composition.decomposition.trie.get32_supplementary(upcoming32)}
2784 };
2785 if likely(potential_passthrough_and_cannot_combine_backwards_impl(trie_value)) {
2786 // Fast-track succeeded!
2787 continue 'fast;
2788 }
2789 break 'surrogateloop;
2790 // } else {
2791 // code_unit_iter = iter_backup;
2792 }
2793 }
2794 }
2795 // unpaired surrogate
2796 upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
2797 // trie_value already holds a decomposition to U+FFFD.
2798 debug_assert_eq!(trie_value, NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER | 0xFFFD);
2799 break 'surrogateloop;
2800 }
2801
2802 // SAFETY: upcoming32 can no longer be a surrogate.
2803 let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
2804 let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
2805 // We need to fall off the fast path.
2806 composition.decomposition.pending = Some(upcoming_with_trie_value);
2807 let Some(consumed_so_far_slice) = pending_slice.get(..pending_slice.len() -
2808 // code_unit_iter.as_slice().len()
2809 // SAFETY: `ptr` and `end` have been derived from the same allocation
2810 // and `ptr` is never greater than `end`.
2811 unsafe { end.offset_from(ptr) as usize }
2812 - upcoming.len_utf16()) else {
2813 // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2814 debug_assert!(false);
2815 // Throw away the results of the fast path.
2816 break 'fastwrap;
2817 };
2818 let mut consumed_so_far = consumed_so_far_slice.chars();
2819 let Some(c_from_back) = consumed_so_far.next_back() else {
2820 // If we ever come here, it's a bug, but let's avoid panic code paths in release builds.
2821 debug_assert!(false);
2822 // Throw away the results of the fast path.
2823 break 'fastwrap;
2824 };
2825 // TODO: If the previous character was below the passthrough bound,
2826 // we really need to read from the trie. Otherwise, we could maintain
2827 // the most-recent trie value. Need to measure what's more expensive:
2828 // Remembering the trie value on each iteration or re-reading the
2829 // last one after the fast-track run.
2830 undecomposed_starter = composition.decomposition.attach_trie_value(c_from_back);
2831 sink.write_slice(consumed_so_far.as_slice())?;
2832 break 'fast;
2833 }
2834 // End of stream
2835 sink.write_slice(pending_slice)?;
2836 return Ok(());
2837 }
2838 // Sync the main iterator
2839 // composition.decomposition.delegate = code_unit_iter.as_slice().chars();
2840 // SAFETY: `ptr` and `end` have been derive from the same allocation
2841 // and `ptr` is never greater than `end`.
2842 composition.decomposition.delegate = unsafe { core::slice::from_raw_parts(ptr, end.offset_from(ptr) as usize) }.chars();
2843 break 'fastwrap;
2844 }
2845 },
2846 text,
2847 sink,
2848 composition,
2849 composition_passthrough_bound,
2850 undecomposed_starter,
2851 pending_slice,
2852 len_utf16,
2853 );
2854}
2855
2856/// A normalizer for performing composing normalization.
2857#[derive(Debug)]
2858pub struct ComposingNormalizer {
2859 decomposing_normalizer: DecomposingNormalizer,
2860 canonical_compositions: DataPayload<NormalizerNfcV1>,
2861}
2862
2863impl ComposingNormalizer {
2864 /// Constructs a borrowed version of this type for more efficient querying.
2865 pub fn as_borrowed(&self) -> ComposingNormalizerBorrowed<'_> {
2866 ComposingNormalizerBorrowed {
2867 decomposing_normalizer: self.decomposing_normalizer.as_borrowed(),
2868 canonical_compositions: self.canonical_compositions.get(),
2869 }
2870 }
2871
2872 /// NFC constructor using compiled data.
2873 ///
2874 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2875 ///
2876 /// [📚 Help choosing a constructor](icu_provider::constructors)
2877 #[cfg(feature = "compiled_data")]
2878 pub const fn new_nfc() -> ComposingNormalizerBorrowed<'static> {
2879 ComposingNormalizerBorrowed::new_nfc()
2880 }
2881
2882 icu_provider::gen_buffer_data_constructors!(
2883 () -> error: DataError,
2884 functions: [
2885 new_nfc: skip,
2886 try_new_nfc_with_buffer_provider,
2887 try_new_nfc_unstable,
2888 Self,
2889 ]
2890 );
2891
2892 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
2893 pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, DataError>
2894 where
2895 D: DataProvider<NormalizerNfdDataV1>
2896 + DataProvider<NormalizerNfdTablesV1>
2897 + DataProvider<NormalizerNfcV1>
2898 + ?Sized,
2899 {
2900 let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?;
2901
2902 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2903 provider.load(Default::default())?.payload;
2904
2905 Ok(ComposingNormalizer {
2906 decomposing_normalizer,
2907 canonical_compositions,
2908 })
2909 }
2910
2911 /// NFKC constructor using compiled data.
2912 ///
2913 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
2914 ///
2915 /// [📚 Help choosing a constructor](icu_provider::constructors)
2916 #[cfg(feature = "compiled_data")]
2917 pub const fn new_nfkc() -> ComposingNormalizerBorrowed<'static> {
2918 ComposingNormalizerBorrowed::new_nfkc()
2919 }
2920
2921 icu_provider::gen_buffer_data_constructors!(
2922 () -> error: DataError,
2923 functions: [
2924 new_nfkc: skip,
2925 try_new_nfkc_with_buffer_provider,
2926 try_new_nfkc_unstable,
2927 Self,
2928 ]
2929 );
2930
2931 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
2932 pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, DataError>
2933 where
2934 D: DataProvider<NormalizerNfkdDataV1>
2935 + DataProvider<NormalizerNfdTablesV1>
2936 + DataProvider<NormalizerNfkdTablesV1>
2937 + DataProvider<NormalizerNfcV1>
2938 + ?Sized,
2939 {
2940 let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?;
2941
2942 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2943 provider.load(Default::default())?.payload;
2944
2945 Ok(ComposingNormalizer {
2946 decomposing_normalizer,
2947 canonical_compositions,
2948 })
2949 }
2950
2951 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
2952 pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, DataError>
2953 where
2954 D: DataProvider<NormalizerUts46DataV1>
2955 + DataProvider<NormalizerNfdTablesV1>
2956 + DataProvider<NormalizerNfkdTablesV1>
2957 // UTS 46 tables merged into CompatibilityDecompositionTablesV1
2958 + DataProvider<NormalizerNfcV1>
2959 + ?Sized,
2960 {
2961 let decomposing_normalizer =
2962 DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
2963
2964 let canonical_compositions: DataPayload<NormalizerNfcV1> =
2965 provider.load(Default::default())?.payload;
2966
2967 Ok(ComposingNormalizer {
2968 decomposing_normalizer,
2969 canonical_compositions,
2970 })
2971 }
2972}
2973
2974#[cfg(feature = "utf16_iter")]
2975struct IsNormalizedSinkUtf16<'a> {
2976 expect: &'a [u16],
2977}
2978
2979#[cfg(feature = "utf16_iter")]
2980impl<'a> IsNormalizedSinkUtf16<'a> {
2981 pub fn new(slice: &'a [u16]) -> Self {
2982 IsNormalizedSinkUtf16 { expect: slice }
2983 }
2984 pub fn remaining_len(&self) -> usize {
2985 self.expect.len()
2986 }
2987}
2988
2989#[cfg(feature = "utf16_iter")]
2990impl write16::Write16 for IsNormalizedSinkUtf16<'_> {
2991 fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
2992 // We know that if we get a slice, it's a pass-through,
2993 // so we can compare addresses. Indexing is OK, because
2994 // an indexing failure would be a code bug rather than
2995 // an input or data issue.
2996 #[expect(clippy::indexing_slicing)]
2997 if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
2998 self.expect = &self.expect[s.len()..];
2999 Ok(())
3000 } else {
3001 Err(core::fmt::Error {})
3002 }
3003 }
3004
3005 fn write_char(&mut self, c: char) -> core::fmt::Result {
3006 let mut iter = self.expect.chars();
3007 if iter.next() == Some(c) {
3008 self.expect = iter.as_slice();
3009 Ok(())
3010 } else {
3011 Err(core::fmt::Error {})
3012 }
3013 }
3014}
3015
3016#[cfg(feature = "utf8_iter")]
3017struct IsNormalizedSinkUtf8<'a> {
3018 expect: &'a [u8],
3019}
3020
3021#[cfg(feature = "utf8_iter")]
3022impl<'a> IsNormalizedSinkUtf8<'a> {
3023 pub fn new(slice: &'a [u8]) -> Self {
3024 IsNormalizedSinkUtf8 { expect: slice }
3025 }
3026 pub fn remaining_len(&self) -> usize {
3027 self.expect.len()
3028 }
3029}
3030
3031#[cfg(feature = "utf8_iter")]
3032impl core::fmt::Write for IsNormalizedSinkUtf8<'_> {
3033 fn write_str(&mut self, s: &str) -> core::fmt::Result {
3034 // We know that if we get a slice, it's a pass-through,
3035 // so we can compare addresses. Indexing is OK, because
3036 // an indexing failure would be a code bug rather than
3037 // an input or data issue.
3038 #[expect(clippy::indexing_slicing)]
3039 if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
3040 self.expect = &self.expect[s.len()..];
3041 Ok(())
3042 } else {
3043 Err(core::fmt::Error {})
3044 }
3045 }
3046
3047 fn write_char(&mut self, c: char) -> core::fmt::Result {
3048 let mut iter = self.expect.chars();
3049 if iter.next() == Some(c) {
3050 self.expect = iter.as_slice();
3051 Ok(())
3052 } else {
3053 Err(core::fmt::Error {})
3054 }
3055 }
3056}
3057
3058struct IsNormalizedSinkStr<'a> {
3059 expect: &'a str,
3060}
3061
3062impl<'a> IsNormalizedSinkStr<'a> {
3063 pub fn new(slice: &'a str) -> Self {
3064 IsNormalizedSinkStr { expect: slice }
3065 }
3066 pub fn remaining_len(&self) -> usize {
3067 self.expect.len()
3068 }
3069}
3070
3071impl core::fmt::Write for IsNormalizedSinkStr<'_> {
3072 fn write_str(&mut self, s: &str) -> core::fmt::Result {
3073 // We know that if we get a slice, it's a pass-through,
3074 // so we can compare addresses. Indexing is OK, because
3075 // an indexing failure would be a code bug rather than
3076 // an input or data issue.
3077 if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
3078 self.expect = &self.expect[s.len()..];
3079 Ok(())
3080 } else {
3081 Err(core::fmt::Error {})
3082 }
3083 }
3084
3085 fn write_char(&mut self, c: char) -> core::fmt::Result {
3086 let mut iter = self.expect.chars();
3087 if iter.next() == Some(c) {
3088 self.expect = iter.as_str();
3089 Ok(())
3090 } else {
3091 Err(core::fmt::Error {})
3092 }
3093 }
3094}