icu_collator/
comparison.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

// Various collation-related algorithms and constants in this file are
// adapted from ICU4C and, therefore, are subject to the ICU license as
// described in LICENSE.

//! This module holds the `Collator` struct whose `compare_impl()` contains
//! the comparison of collation element sequences.

use alloc::collections::VecDeque;
use alloc::vec::Vec;

use crate::elements::CharacterAndClassAndTrieValue;
use crate::elements::CollationElement32;
use crate::elements::Tag;
use crate::elements::BACKWARD_COMBINING_MARKER;
use crate::elements::CE_BUFFER_SIZE;
use crate::elements::FALLBACK_CE32;
use crate::elements::NON_ROUND_TRIP_MARKER;
use crate::elements::{
    char_from_u32, CollationElement, CollationElements, NonPrimary, FFFD_CE32,
    HANGUL_SYLLABLE_MARKER, HIGH_ZEROS_MASK, JAMO_COUNT, LOW_ZEROS_MASK, NO_CE, NO_CE_PRIMARY,
    NO_CE_QUATERNARY, NO_CE_SECONDARY, NO_CE_TERTIARY, OPTIMIZED_DIACRITICS_MAX_COUNT,
    QUATERNARY_MASK,
};
use crate::options::CollatorOptionsBitField;
use crate::options::{
    AlternateHandling, CollatorOptions, MaxVariable, ResolvedCollatorOptions, Strength,
};
use crate::preferences::{CollationCaseFirst, CollationNumericOrdering, CollationType};
use crate::provider::CollationData;
use crate::provider::CollationDiacritics;
use crate::provider::CollationDiacriticsV1;
use crate::provider::CollationJamo;
use crate::provider::CollationJamoV1;
use crate::provider::CollationMetadataV1;
use crate::provider::CollationReordering;
use crate::provider::CollationReorderingV1;
use crate::provider::CollationRootV1;
use crate::provider::CollationSpecialPrimariesV1;
use crate::provider::CollationSpecialPrimariesValidated;
use crate::provider::CollationTailoringV1;
use core::cmp::Ordering;
use core::convert::{Infallible, TryFrom};
use icu_normalizer::provider::DecompositionData;
use icu_normalizer::provider::DecompositionTables;
use icu_normalizer::provider::NormalizerNfdDataV1;
use icu_normalizer::provider::NormalizerNfdTablesV1;
use icu_normalizer::DecomposingNormalizerBorrowed;
use icu_normalizer::Decomposition;
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
use smallvec::SmallVec;
use utf16_iter::Utf16CharsEx;
use utf8_iter::Utf8CharsEx;
use zerovec::ule::AsULE;

// Special sort key bytes for all levels.
const LEVEL_SEPARATOR_BYTE: u8 = 1;

/// Merge-sort-key separator.
///
/// Same as the unique primary and identical-level weights of U+FFFE.  Must not
/// be used as primary compression low terminator.  Otherwise usable.
const MERGE_SEPARATOR: char = '\u{fffe}';
const MERGE_SEPARATOR_BYTE: u8 = 2;
const MERGE_SEPARATOR_PRIMARY: u32 = 0x02000000;

/// Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE.
///
/// Reserved value in primary second byte if the lead byte is compressible.
/// Otherwise usable in all CE weight bytes.
const PRIMARY_COMPRESSION_LOW_BYTE: u8 = 3;

/// Primary compression high terminator.
///
/// Reserved value in primary second byte if the lead byte is compressible.
/// Otherwise usable in all CE weight bytes.
const PRIMARY_COMPRESSION_HIGH_BYTE: u8 = 0xff;

/// Default secondary/tertiary weight lead byte.
const COMMON_BYTE: u8 = 5;
const COMMON_WEIGHT16: u16 = 0x0500;

// Internal flags for sort key generation
const PRIMARY_LEVEL_FLAG: u8 = 0x01;
const SECONDARY_LEVEL_FLAG: u8 = 0x02;
const CASE_LEVEL_FLAG: u8 = 0x04;
const TERTIARY_LEVEL_FLAG: u8 = 0x08;
const QUATERNARY_LEVEL_FLAG: u8 = 0x10;

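// `LEVEL_MASKS` is indexed by `Strength as usize`; the zero entries correspond
// to the unused discriminant values between `Strength::Quaternary` (3) and
// `Strength::Identical` (7).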
const LEVEL_MASKS: [u8; Strength::Identical as usize + 1] = [
    PRIMARY_LEVEL_FLAG,
    PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG,
    PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG | TERTIARY_LEVEL_FLAG,
    PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG | TERTIARY_LEVEL_FLAG | QUATERNARY_LEVEL_FLAG,
    0,
    0,
    0,
    PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG | TERTIARY_LEVEL_FLAG | QUATERNARY_LEVEL_FLAG,
];

// Internal constants for indexing into the below compression configurations
const WEIGHT_LOW: usize = 0;
const WEIGHT_MIDDLE: usize = 1;
const WEIGHT_HIGH: usize = 2;
const WEIGHT_MAX_COUNT: usize = 3;

// Secondary level: Compress up to 33 common weights as 05..25 or 25..45.
const SEC_COMMON: [u8; 4] = [COMMON_BYTE, COMMON_BYTE + 0x20, COMMON_BYTE + 0x40, 0x21];

// Case level, lowerFirst: Compress up to 7 common weights as 1..7 or 7..13.
const CASE_LOWER_FIRST_COMMON: [u8; 4] = [1, 7, 13, 7];

// Case level, upperFirst: Compress up to 13 common weights as 3..15.
const CASE_UPPER_FIRST_COMMON: [u8; 4] = [3, 0 /* unused */, 15, 13];

// Tertiary level only (no case): Compress up to 97 common weights as 05..65 or 65..C5.
const TER_ONLY_COMMON: [u8; 4] = [COMMON_BYTE, COMMON_BYTE + 0x60, COMMON_BYTE + 0xc0, 0x61];

// Tertiary with case, lowerFirst: Compress up to 33 common weights as 05..25 or 25..45.
const TER_LOWER_FIRST_COMMON: [u8; 4] = [COMMON_BYTE, COMMON_BYTE + 0x20, COMMON_BYTE + 0x40, 0x21];

// Tertiary with case, upperFirst: Compress up to 33 common weights as 85..A5 or A5..C5.
const TER_UPPER_FIRST_COMMON: [u8; 4] = [
    COMMON_BYTE + 0x80,
    COMMON_BYTE + 0x80 + 0x20,
    COMMON_BYTE + 0x80 + 0x40,
    0x21,
];

const QUAT_COMMON: [u8; 4] = [0x1c, 0x1c + 0x70, 0x1c + 0xe0, 0x71];
const QUAT_SHIFTED_LIMIT_BYTE: u8 = QUAT_COMMON[WEIGHT_LOW] - 1; // 0x1b

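// The `SLOPE_*` constants below parameterize a variable-length difference
// coding (adapted from ICU4C's BOCSU-style machinery) used to compress the
// identical level of sort keys: small differences between consecutive code
// points get one byte, larger differences two or three bytes.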
// Do not use byte values 0, 1, 2 because they are separators in sort keys.
const SLOPE_MIN: i32 = 3;
const SLOPE_MAX: i32 = 0xff;
const SLOPE_MIDDLE: i32 = 0x81;
const SLOPE_TAIL_COUNT: i32 = SLOPE_MAX - SLOPE_MIN + 1;
const SLOPE_SINGLE: i32 = 80;
const SLOPE_LEAD_2: i32 = 42;
const SLOPE_LEAD_3: i32 = 3;

// The difference value range for single-byters.
const SLOPE_REACH_POS_1: i32 = SLOPE_SINGLE;
const SLOPE_REACH_NEG_1: i32 = -SLOPE_SINGLE;

// The difference value range for double-byters.
const SLOPE_REACH_POS_2: i32 = SLOPE_LEAD_2 * SLOPE_TAIL_COUNT + (SLOPE_LEAD_2 - 1);
const SLOPE_REACH_NEG_2: i32 = -SLOPE_REACH_POS_2 - 1;

// The difference value range for 3-byters.
const SLOPE_REACH_POS_3: i32 = SLOPE_LEAD_3 * SLOPE_TAIL_COUNT * SLOPE_TAIL_COUNT
    + (SLOPE_LEAD_3 - 1) * SLOPE_TAIL_COUNT
    + (SLOPE_TAIL_COUNT - 1);
const SLOPE_REACH_NEG_3: i32 = -SLOPE_REACH_POS_3 - 1;

// The lead byte start values.
const SLOPE_START_POS_2: i32 = SLOPE_MIDDLE + SLOPE_SINGLE + 1;
const SLOPE_START_POS_3: i32 = SLOPE_START_POS_2 + SLOPE_LEAD_2;
const SLOPE_START_NEG_2: i32 = SLOPE_MIDDLE + SLOPE_REACH_NEG_1;
const SLOPE_START_NEG_3: i32 = SLOPE_START_NEG_2 - SLOPE_LEAD_2;

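/// ORs together the non-primary bits of the collation elements seen so far so
/// that the caller can cheaply check afterwards whether any of them carried a
/// quaternary weight.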
struct AnyQuaternaryAccumulator(u32);

impl AnyQuaternaryAccumulator {
    #[inline(always)]
    pub fn new() -> Self {
        AnyQuaternaryAccumulator(0)
    }
    #[inline(always)]
    pub fn accumulate(&mut self, non_primary: NonPrimary) {
        self.0 |= non_primary.bits()
    }
    #[inline(always)]
    pub fn has_quaternary(&self) -> bool {
        self.0 & u32::from(QUATERNARY_MASK) != 0
    }
}

/// `true` iff `i` is greater than or equal to `start` and less than or equal
/// to `end`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
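    // If `i < start`, the wrapping subtraction produces a large value that
    // exceeds `end - start`, so both bounds are checked with a single comparison.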
    i.wrapping_sub(start) <= (end - start)
}

/// Helper trait for getting a `char` iterator from Latin1 data.
///
/// ✨ *Enabled with the `latin1` Cargo feature.*
#[cfg(feature = "latin1")]
trait Latin1Chars {
    fn latin1_chars(&self) -> impl DoubleEndedIterator<Item = char>;
}

#[cfg(feature = "latin1")]
impl Latin1Chars for [u8] {
    fn latin1_chars(&self) -> impl DoubleEndedIterator<Item = char> {
        self.iter().map(|b| char::from(*b))
    }
}

/// Finds the identical prefix of `left` and `right` containing
/// Latin1.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
///
/// ✨ *Enabled with the `latin1` Cargo feature.*
#[cfg(feature = "latin1")]
fn split_prefix_latin1<'a, 'b>(left: &'a [u8], right: &'b [u8]) -> (&'a [u8], &'a [u8], &'b [u8]) {
    let i = left
        .iter()
        .zip(right.iter())
        .take_while(|(l, r)| l == r)
        .count();
    if let Some((head, left_tail)) = left.split_at_checked(i) {
        if let Some(right_tail) = right.get(i..) {
            return (head, left_tail, right_tail);
        }
    }
    (&[], left, right)
}

/// Finds the identical prefix of `left` containing Latin1
/// and `right` containing potentially ill-formed UTF-16.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
///
/// ✨ *Enabled with the `latin1` Cargo feature.*
#[cfg(feature = "latin1")]
fn split_prefix_latin1_utf16<'a, 'b>(
    left: &'a [u8],
    right: &'b [u16],
) -> (&'a [u8], &'a [u8], &'b [u16]) {
    let i = left
        .iter()
        .zip(right.iter())
        .take_while(|(l, r)| u16::from(**l) == **r)
        .count();
    if let Some((head, left_tail)) = left.split_at_checked(i) {
        if let Some(right_tail) = right.get(i..) {
            return (head, left_tail, right_tail);
        }
    }
    (&[], left, right)
}

/// Finds the identical prefix of `left` and `right` containing
/// potentially ill-formed UTF-16, while avoiding splitting a
/// well-formed surrogate pair. In case of ill-formed
/// UTF-16, the prefix is not guaranteed to be maximal.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
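///
/// For example (illustrative sketch): with `left` = `[0x0061, 0xD83D, 0xDE00]`
/// and `right` = `[0x0061, 0xD83D, 0xDE01]`, the naive common prefix would end
/// between the surrogates of a pair, so the split is backed up to
/// `([0x0061], [0xD83D, 0xDE00], [0xD83D, 0xDE01])`.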
fn split_prefix_u16<'a, 'b>(
    left: &'a [u16],
    right: &'b [u16],
) -> (&'a [u16], &'a [u16], &'b [u16]) {
    let mut i = left
        .iter()
        .zip(right.iter())
        .take_while(|(l, r)| l == r)
        .count();
    if i != 0 {
        if let Some(&last) = left.get(i.wrapping_sub(1)) {
            if in_inclusive_range16(last, 0xD800, 0xDBFF) {
                i -= 1;
            }
            if let Some((head, left_tail)) = left.split_at_checked(i) {
                if let Some(right_tail) = right.get(i..) {
                    return (head, left_tail, right_tail);
                }
            }
        }
    }
    (&[], left, right)
}

/// Finds the identical prefix of `left` and `right` containing
/// potentially ill-formed UTF-8, while avoiding splitting a UTF-8
/// byte sequence. In case of ill-formed UTF-8, the prefix is
/// not guaranteed to be maximal.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
fn split_prefix_u8<'a, 'b>(left: &'a [u8], right: &'b [u8]) -> (&'a [u8], &'a [u8], &'b [u8]) {
    let mut i = left
        .iter()
        .zip(right.iter())
        .take_while(|(l, r)| l == r)
        .count();
    if i != 0 {
        // Tails must not start with a UTF-8 continuation
        // byte unless it's the first byte of the original
        // slice.

        // At index `i`, `left` and `right` differ, so `right`
        // needs a check of its own, but only once: before index
        // `i` the slices are identical, so the loop below only
        // needs to examine `left`.
        if let Some(right_first) = right.get(i) {
            if (right_first & 0b1100_0000) == 0b1000_0000 {
                i -= 1;
            }
        }
        while i != 0 {
            if let Some(left_first) = left.get(i) {
                if (left_first & 0b1100_0000) == 0b1000_0000 {
                    i -= 1;
                    continue;
                }
            }
            break;
        }
        if let Some((head, left_tail)) = left.split_at_checked(i) {
            if let Some(right_tail) = right.get(i..) {
                return (head, left_tail, right_tail);
            }
        }
    }
    (&[], left, right)
}

/// Finds the identical prefix of `left` and `right` containing
/// guaranteed well-formed UTF-8.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
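///
/// For example (illustrative sketch): `split_prefix("abcd", "abed")` returns
/// `("ab", "cd", "ed")`, and `split_prefix("aé", "aè")` returns
/// `("a", "é", "è")` rather than splitting the two-byte UTF-8 sequences.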
fn split_prefix<'a, 'b>(left: &'a str, right: &'b str) -> (&'a str, &'a str, &'b str) {
    let left_bytes = left.as_bytes();
    let right_bytes = right.as_bytes();
    let mut i = left_bytes
        .iter()
        .zip(right_bytes.iter())
        .take_while(|(l, r)| l == r)
        .count();
    if i != 0 {
        // Tails must not start with a UTF-8 continuation
        // byte.

        // Since the inputs are valid UTF-8, the first byte
        // of either input slice cannot be a continuation byte,
        // so we may rely on finding a lead byte when walking
        // backwards.

        // Since the inputs are valid UTF-8, if a tail starts
        // with a continuation, both tails must start with a
        // continuation, since the most recent lead byte must
        // be equal, so the difference is within valid UTF-8
        // sequences of equal length.

        // Therefore, it's sufficient to examine only one of
        // the sides.
        loop {
            if let Some(left_first) = left_bytes.get(i) {
                if (left_first & 0b1100_0000) == 0b1000_0000 {
                    i -= 1;
                    continue;
                }
            }
            break;
        }
        // The methods below perform redundant UTF-8 boundary checks,
        // since we just verified the boundary above. However, we avoid
        // `unsafe` to make this code easier to audit.
        if let Some((head, left_tail)) = left.split_at_checked(i) {
            if let Some(right_tail) = right.get(i..) {
                return (head, left_tail, right_tail);
            }
        }
    }
    ("", left, right)
}

/// Holder struct for payloads that are locale-dependent. (For code
/// reuse between owned and borrowed cases.)
#[derive(Debug)]
struct LocaleSpecificDataHolder {
    tailoring: Option<DataPayload<CollationTailoringV1>>,
    diacritics: DataPayload<CollationDiacriticsV1>,
    reordering: Option<DataPayload<CollationReorderingV1>>,
    merged_options: CollatorOptionsBitField,
    lithuanian_dot_above: bool,
}

icu_locale_core::preferences::define_preferences!(
    /// The preferences for collation.
    ///
    /// # Preferences
    ///
    /// Examples for using the different preferences below can be found in the [crate-level docs](crate).
    ///
    /// ## Case First
    ///
    /// See the [spec](https://www.unicode.org/reports/tr35/tr35-collation.html#Case_Parameters).
    /// This is the BCP47 key `kf`. Three possibilities: [`CollationCaseFirst::False`] (default,
    /// except for Danish and Maltese), [`CollationCaseFirst::Lower`], and [`CollationCaseFirst::Upper`]
    /// (default for Danish and Maltese).
    ///
    /// ## Numeric
    ///
    /// This is the BCP47 key `kn`. When set to [`CollationNumericOrdering::True`], any sequence of decimal
    /// digits (General_Category = Nd) is sorted at the primary level according to the
    /// numeric value. The default is [`CollationNumericOrdering::False`].
    [Copy]
    CollatorPreferences,
    {
        /// The collation type. This corresponds to the `-u-co` BCP-47 tag.
        collation_type: CollationType,
        /// Treatment of case. (Large and small kana differences are treated as case differences.)
        /// This corresponds to the `-u-kf` BCP-47 tag.
        case_first: CollationCaseFirst,
        /// When set to `True`, any sequence of decimal digits is sorted at a primary level according
        /// to the numeric value.
        /// This corresponds to the `-u-kn` BCP-47 tag.
        numeric_ordering: CollationNumericOrdering
    }
);

impl LocaleSpecificDataHolder {
    /// The constructor code reused between owned and borrowed cases.
    fn try_new_unstable_internal<D>(
        provider: &D,
        prefs: CollatorPreferences,
        options: CollatorOptions,
    ) -> Result<Self, DataError>
    where
        D: DataProvider<CollationTailoringV1>
            + DataProvider<CollationDiacriticsV1>
            + DataProvider<CollationMetadataV1>
            + DataProvider<CollationReorderingV1>
            + ?Sized,
    {
        let marker_attributes = prefs
            .collation_type
            .as_ref()
            // all collation types are valid marker attributes
            .map(|c| DataMarkerAttributes::from_str_or_panic(c.as_str()))
            .unwrap_or_default();

        let data_locale = CollationTailoringV1::make_locale(prefs.locale_preferences);
        let req = DataRequest {
            id: DataIdentifierBorrowed::for_marker_attributes_and_locale(
                marker_attributes,
                &data_locale,
            ),
            metadata: {
                let mut metadata = DataRequestMetadata::default();
                metadata.silent = true;
                metadata
            },
        };

        let fallback_req = DataRequest {
            id: DataIdentifierBorrowed::for_marker_attributes_and_locale(
                Default::default(),
                &data_locale,
            ),
            ..Default::default()
        };

        let metadata_payload: DataPayload<crate::provider::CollationMetadataV1> = provider
            .load(req)
            .or_else(|_| provider.load(fallback_req))?
            .payload;

        let metadata = metadata_payload.get();

        let tailoring: Option<DataPayload<crate::provider::CollationTailoringV1>> =
            if metadata.tailored() {
                Some(
                    provider
                        .load(req)
                        .or_else(|_| provider.load(fallback_req))?
                        .payload,
                )
            } else {
                None
            };

        let reordering: Option<DataPayload<crate::provider::CollationReorderingV1>> =
            if metadata.reordering() {
                Some(
                    provider
                        .load(req)
                        .or_else(|_| provider.load(fallback_req))?
                        .payload,
                )
            } else {
                None
            };

        if let Some(reordering) = &reordering {
            if reordering.get().reorder_table.len() != 256 {
                return Err(DataError::custom("invalid").with_marker(CollationReorderingV1::INFO));
            }
        }

        let tailored_diacritics = metadata.tailored_diacritics();
        let diacritics: DataPayload<CollationDiacriticsV1> = provider
            .load(if tailored_diacritics {
                req
            } else {
                Default::default()
            })?
            .payload;

        if tailored_diacritics {
            // In the tailored case we accept a shorter table in which case the tailoring is
            // responsible for supplying the missing values in the trie.
            // As of June 2022, none of the collations actually use a shortened table.
            // Vietnamese and Ewe load a full-length alternative table and the rest use
            // the default one.
            if diacritics.get().secondaries.len() > OPTIMIZED_DIACRITICS_MAX_COUNT {
                return Err(DataError::custom("invalid").with_marker(CollationDiacriticsV1::INFO));
            }
        } else if diacritics.get().secondaries.len() != OPTIMIZED_DIACRITICS_MAX_COUNT {
            return Err(DataError::custom("invalid").with_marker(CollationDiacriticsV1::INFO));
        }

        let mut altered_defaults = CollatorOptionsBitField::default();

        if metadata.alternate_shifted() {
            altered_defaults.set_alternate_handling(Some(AlternateHandling::Shifted));
        }
        if metadata.backward_second_level() {
            altered_defaults.set_backward_second_level(Some(true));
        }

        altered_defaults.set_case_first(Some(metadata.case_first()));
        altered_defaults.set_max_variable(Some(metadata.max_variable()));

        let mut merged_options = CollatorOptionsBitField::from(options);
        merged_options.set_case_first(prefs.case_first);
        merged_options.set_numeric_from_enum(prefs.numeric_ordering);
        merged_options.set_defaults(altered_defaults);

        Ok(LocaleSpecificDataHolder {
            tailoring,
            diacritics,
            merged_options,
            reordering,
            lithuanian_dot_above: metadata.lithuanian_dot_above(),
        })
    }
}

/// Compares strings according to culturally-relevant ordering.
#[derive(Debug)]
pub struct Collator {
    special_primaries: DataPayload<ErasedMarker<CollationSpecialPrimariesValidated<'static>>>,
    root: DataPayload<CollationRootV1>,
    tailoring: Option<DataPayload<CollationTailoringV1>>,
    jamo: DataPayload<CollationJamoV1>,
    diacritics: DataPayload<CollationDiacriticsV1>,
    options: CollatorOptionsBitField,
    reordering: Option<DataPayload<CollationReorderingV1>>,
    decompositions: DataPayload<NormalizerNfdDataV1>,
    tables: DataPayload<NormalizerNfdTablesV1>,
    lithuanian_dot_above: bool,
}

impl Collator {
    /// Constructs a borrowed version of this type for more efficient querying.
    pub fn as_borrowed(&self) -> CollatorBorrowed<'_> {
        CollatorBorrowed {
            special_primaries: self.special_primaries.get(),
            root: self.root.get(),
            tailoring: self.tailoring.as_ref().map(|s| s.get()),
            jamo: self.jamo.get(),
            diacritics: self.diacritics.get(),
            options: self.options,
            reordering: self.reordering.as_ref().map(|s| s.get()),
            decompositions: self.decompositions.get(),
            tables: self.tables.get(),
            lithuanian_dot_above: self.lithuanian_dot_above,
        }
    }

    /// Creates `CollatorBorrowed` for the given locale and options from compiled data.
    #[cfg(feature = "compiled_data")]
    pub fn try_new(
        prefs: CollatorPreferences,
        options: CollatorOptions,
    ) -> Result<CollatorBorrowed<'static>, DataError> {
        CollatorBorrowed::try_new(prefs, options)
    }

    icu_provider::gen_buffer_data_constructors!(
        (prefs: CollatorPreferences, options: CollatorOptions) -> error: DataError,
        functions: [
            try_new: skip,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self
        ]
    );

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
    pub fn try_new_unstable<D>(
        provider: &D,
        prefs: CollatorPreferences,
        options: CollatorOptions,
    ) -> Result<Self, DataError>
    where
        D: DataProvider<CollationSpecialPrimariesV1>
            + DataProvider<CollationRootV1>
            + DataProvider<CollationTailoringV1>
            + DataProvider<CollationDiacriticsV1>
            + DataProvider<CollationJamoV1>
            + DataProvider<CollationMetadataV1>
            + DataProvider<CollationReorderingV1>
            + DataProvider<NormalizerNfdDataV1>
            + DataProvider<NormalizerNfdTablesV1>
            + ?Sized,
    {
        Self::try_new_unstable_internal(
            provider,
            provider.load(Default::default())?.payload,
            provider.load(Default::default())?.payload,
            provider.load(Default::default())?.payload,
            provider.load(Default::default())?.payload,
            provider.load(Default::default())?.payload,
            prefs,
            options,
        )
    }

    #[expect(clippy::too_many_arguments)]
    fn try_new_unstable_internal<D>(
        provider: &D,
        root: DataPayload<CollationRootV1>,
        decompositions: DataPayload<NormalizerNfdDataV1>,
        tables: DataPayload<NormalizerNfdTablesV1>,
        jamo: DataPayload<CollationJamoV1>,
        special_primaries: DataPayload<CollationSpecialPrimariesV1>,
        prefs: CollatorPreferences,
        options: CollatorOptions,
    ) -> Result<Self, DataError>
    where
        D: DataProvider<CollationRootV1>
            + DataProvider<CollationTailoringV1>
            + DataProvider<CollationDiacriticsV1>
            + DataProvider<CollationMetadataV1>
            + DataProvider<CollationReorderingV1>
            + ?Sized,
    {
        let locale_dependent =
            LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;

        // TODO: redesign Korean search collation handling
        if jamo.get().ce32s.len() != JAMO_COUNT {
            return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO));
        }

        // `variant_count` isn't stable yet:
        // https://github.com/rust-lang/rust/issues/73662
        if special_primaries.get().last_primaries.len() <= (MaxVariable::Currency as usize) {
            return Err(DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO));
        }
        let special_primaries = special_primaries.map_project(|csp, _| {
            let compressible_bytes = (csp.last_primaries.len()
                == MaxVariable::Currency as usize + 16)
                .then(|| {
                    csp.last_primaries
                        .as_maybe_borrowed()?
                        .as_ule_slice()
                        .get((MaxVariable::Currency as usize)..)?
                        .try_into()
                        .ok()
                })
                .flatten()
                .unwrap_or(
                    CollationSpecialPrimariesValidated::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK,
                );

            CollationSpecialPrimariesValidated {
                last_primaries: csp.last_primaries.truncated(MaxVariable::Currency as usize),
                numeric_primary: csp.numeric_primary,
                compressible_bytes,
            }
        });

        Ok(Collator {
            special_primaries,
            root,
            tailoring: locale_dependent.tailoring,
            jamo,
            diacritics: locale_dependent.diacritics,
            options: locale_dependent.merged_options,
            reordering: locale_dependent.reordering,
            decompositions,
            tables,
            lithuanian_dot_above: locale_dependent.lithuanian_dot_above,
        })
    }
}

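// Each expansion of `compare!` first splits off the identical prefix, then
// compares collation elements via `compare_impl`, and, if the strength is
// `Identical` and the result is still equal, breaks the tie by comparing the
// NFD decompositions of the tails code point by code point, with U+FFFE (the
// merge separator) mapped below all other code points.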
macro_rules! compare {
    ($(#[$meta:meta])*,
     $compare:ident,
     $left_slice:ty,
     $right_slice:ty,
     $split_prefix:ident,
     $left_to_iter:ident,
     $right_to_iter:ident,
    ) => {
        $(#[$meta])*
        pub fn $compare(&self, left: &$left_slice, right: &$right_slice) -> Ordering {
            let (head, left_tail, right_tail) = $split_prefix(left, right);
            if left_tail.is_empty() && right_tail.is_empty() {
                return Ordering::Equal;
            }
            let ret = self.compare_impl(left_tail.$left_to_iter(), right_tail.$right_to_iter(), head.$left_to_iter().rev());
            if self.options.strength() == Strength::Identical && ret == Ordering::Equal {
                return Decomposition::new(left_tail.$left_to_iter(), self.decompositions, self.tables).map(|c| if c != MERGE_SEPARATOR { c as i32 } else { -1i32 }).cmp(
                    Decomposition::new(right_tail.$right_to_iter(), self.decompositions, self.tables).map(|c| if c != MERGE_SEPARATOR { c as i32 } else { -1i32 }),
                );
            }
            ret
        }
    }
}

/// Compares strings according to culturally-relevant ordering,
/// borrowed version.
#[derive(Debug)]
pub struct CollatorBorrowed<'a> {
    special_primaries: &'a CollationSpecialPrimariesValidated<'a>,
    root: &'a CollationData<'a>,
    tailoring: Option<&'a CollationData<'a>>,
    jamo: &'a CollationJamo<'a>,
    diacritics: &'a CollationDiacritics<'a>,
    options: CollatorOptionsBitField,
    reordering: Option<&'a CollationReordering<'a>>,
    decompositions: &'a DecompositionData<'a>,
    tables: &'a DecompositionTables<'a>,
    lithuanian_dot_above: bool,
}

impl CollatorBorrowed<'static> {
    /// Creates a collator for the given locale and options from compiled data.
    #[cfg(feature = "compiled_data")]
    pub fn try_new(
        prefs: CollatorPreferences,
        options: CollatorOptions,
    ) -> Result<Self, DataError> {
        // These are assigned to locals in order to keep the code after these assignments
        // copypaste-compatible with `Collator::try_new_unstable_internal`.

        let provider = &crate::provider::Baked;
        let decompositions = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1;
        let tables = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1;
        let root = crate::provider::Baked::SINGLETON_COLLATION_ROOT_V1;
        let jamo = crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1;

        let locale_dependent =
            LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;

        // TODO: redesign Korean search collation handling
        const _: () = assert!(
            crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1
                .ce32s
                .as_slice()
                .len()
                == JAMO_COUNT
        );

        // `variant_count` isn't stable yet:
        // https://github.com/rust-lang/rust/issues/73662
        const _: () = assert!(
            crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
                .last_primaries
                .as_slice()
                .len()
                > (MaxVariable::Currency as usize)
        );

        let special_primaries = const {
            &CollationSpecialPrimariesValidated {
                last_primaries: zerovec::ZeroSlice::from_ule_slice(
                    crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
                        .last_primaries
                        .as_slice()
                        .as_ule_slice()
                        .split_at(MaxVariable::Currency as usize)
                        .0,
                )
                .as_zerovec(),
                numeric_primary: crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
                    .numeric_primary,
                compressible_bytes: {
                    const C: &[<u16 as AsULE>::ULE] =
                        crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
                            .last_primaries
                            .as_slice()
                            .as_ule_slice();
                    if C.len() == MaxVariable::Currency as usize + 16 {
                        let i = MaxVariable::Currency as usize;
                        #[allow(clippy::indexing_slicing)] // protected, const
                        &[
                            C[i],
                            C[i + 1],
                            C[i + 2],
                            C[i + 3],
                            C[i + 4],
                            C[i + 5],
                            C[i + 6],
                            C[i + 7],
                            C[i + 8],
                            C[i + 9],
                            C[i + 10],
                            C[i + 11],
                            C[i + 12],
                            C[i + 13],
                            C[i + 14],
                            C[i + 15],
                        ]
                    } else {
                        CollationSpecialPrimariesValidated::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK
                    }
                },
            }
        };

        // Attribute belongs closer to `unwrap`, but
        // https://github.com/rust-lang/rust/issues/15701
        #[expect(clippy::unwrap_used)]
        Ok(CollatorBorrowed {
            special_primaries,
            root,
            // Unwrap is OK, because we know we have the baked provider.
            tailoring: locale_dependent.tailoring.map(|s| s.get_static().unwrap()),
            jamo,
            // Unwrap is OK, because we know we have the baked provider.
            diacritics: locale_dependent.diacritics.get_static().unwrap(),
            options: locale_dependent.merged_options,
            // Unwrap is OK, because we know we have the baked provider.
            reordering: locale_dependent.reordering.map(|s| s.get_static().unwrap()),
            decompositions,
            tables,
            lithuanian_dot_above: locale_dependent.lithuanian_dot_above,
        })
    }

    /// Cheaply converts a [`CollatorBorrowed<'static>`] into a [`Collator`].
    ///
    /// Note: Due to branching and indirection, using [`Collator`] might inhibit some
    /// compile-time optimizations that are possible with [`CollatorBorrowed`].
    pub const fn static_to_owned(self) -> Collator {
        Collator {
            special_primaries: DataPayload::from_static_ref(self.special_primaries),
            root: DataPayload::from_static_ref(self.root),
            tailoring: if let Some(s) = self.tailoring {
                // `map` not available in const context
                Some(DataPayload::from_static_ref(s))
            } else {
                None
            },
            jamo: DataPayload::from_static_ref(self.jamo),
            diacritics: DataPayload::from_static_ref(self.diacritics),
            options: self.options,
            reordering: if let Some(s) = self.reordering {
                // `map` not available in const context
                Some(DataPayload::from_static_ref(s))
            } else {
                None
            },
            decompositions: DataPayload::from_static_ref(self.decompositions),
            tables: DataPayload::from_static_ref(self.tables),
            lithuanian_dot_above: self.lithuanian_dot_above,
        }
    }
}

macro_rules! collation_elements {
    ($self:expr, $chars:expr, $tailoring:expr, $numeric_primary:expr) => {{
        let jamo = <&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from($self.jamo.ce32s.as_ule_slice());

        let jamo = jamo.unwrap();

        CollationElements::new(
            $chars,
            $self.root,
            $tailoring,
            jamo,
            &$self.diacritics.secondaries,
            $self.decompositions,
            $self.tables,
            $numeric_primary,
            $self.lithuanian_dot_above,
        )
    }};
}

impl CollatorBorrowed<'_> {
    /// The resolved options showing how the default options, the requested options,
    /// and the options from locale data were combined.
    pub fn resolved_options(&self) -> ResolvedCollatorOptions {
        self.options.into()
    }

    compare!(
        /// Compare guaranteed well-formed UTF-8 slices.
        ,
        compare,
        str,
        str,
        split_prefix,
        chars,
        chars,
    );
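
    // A minimal usage sketch (not a doc test), assuming the `compiled_data`
    // feature and that a `Locale` converts into `CollatorPreferences` via
    // `.into()`:
    //
    //     let collator = Collator::try_new(
    //         icu_locale_core::locale!("en").into(),
    //         CollatorOptions::default(),
    //     )
    //     .unwrap();
    //     assert_eq!(collator.compare("inch", "inçh"), core::cmp::Ordering::Less);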

    compare!(
        /// Compare potentially ill-formed UTF-8 slices. Ill-formed input is compared
        /// as if errors had been replaced with REPLACEMENT CHARACTERs according
        /// to the WHATWG Encoding Standard.
        ,
        compare_utf8,
        [u8],
        [u8],
        split_prefix_u8,
        chars,
        chars,
    );

    compare!(
        /// Compare potentially ill-formed UTF-16 slices. Unpaired surrogates
        /// are compared as if each one was a REPLACEMENT CHARACTER.
        ,
        compare_utf16,
        [u16],
        [u16],
        split_prefix_u16,
        chars,
        chars,
    );

    compare!(
        /// Compare Latin1 slices.
        ///
        /// ✨ *Enabled with the `latin1` Cargo feature.*
        #[cfg(feature = "latin1")]
        ,
        compare_latin1,
        [u8],
        [u8],
        split_prefix_latin1,
        latin1_chars,
        latin1_chars,
    );

    compare!(
        /// Compare Latin1 slice with potentially ill-formed UTF-16
        /// slice.
        ///
        /// If you need to compare a potentially ill-formed UTF-16
        /// slice with a Latin1 slice, swap the arguments and
        /// call `reverse()` on the return value.
        ///
        /// ✨ *Enabled with the `latin1` Cargo feature.*
        #[cfg(feature = "latin1")]
        ,
        compare_latin1_utf16,
        [u8],
        [u16],
        split_prefix_latin1_utf16,
        latin1_chars,
        chars,
    );

    #[inline(always)]
    fn tailoring_or_root(&self) -> &CollationData<'_> {
        if let Some(tailoring) = &self.tailoring {
            tailoring
        } else {
            // If the root collation is valid for the locale,
            // use the root as the tailoring so that reads from the
            // tailoring always succeed.
            //
            // TODO(#2011): Do we instead want to have an untailored
            // copypaste of the iterator that omits the tailoring
            // branches for performance at the expense of code size
            // and having to maintain both a tailoring-capable and
            // a tailoring-incapable version of the iterator?
            // Or, in order not to flip the branch prediction around,
            // should we have a no-op tailoring that contains a
            // specially-crafted CodePointTrie that always returns
            // a FALLBACK_CE32 after a single branch?
            self.root
        }
    }

    #[inline(always)]
    fn numeric_primary(&self) -> Option<u8> {
        if self.options.numeric() {
            Some(self.special_primaries.numeric_primary)
        } else {
            None
        }
    }

    #[inline(always)]
    fn variable_top(&self) -> u32 {
        if self.options.alternate_handling() == AlternateHandling::NonIgnorable {
            0
        } else {
            // +1 so that we can use "<" and primary ignorables test out early.
            self.special_primaries
                .last_primary_for_group(self.options.max_variable())
                + 1
        }
    }

    /// The implementation of the comparison operation.
    ///
    /// `head_chars` is an iterator _backward_ over the identical
    /// prefix and `left_chars` and `right_chars` are iterators
    /// _forward_ over the parts after the identical prefix.
    fn compare_impl<
        L: Iterator<Item = char>,
        R: Iterator<Item = char>,
        H: Iterator<Item = char>,
    >(
        &self,
        left_chars: L,
        right_chars: R,
        mut head_chars: H,
    ) -> Ordering {
        // Sadly, it looks like variable CEs and backward second level
        // require us to store the full 64-bit CEs instead of storing only
        // the NonPrimary part.
        //
        // TODO(#2008): Consider having two monomorphizations of this method:
        // one that can deal with variables shifted to quaternary and
        // backward second level and another that doesn't support that
        // and only stores `NonPrimary` in `left_ces` and `right_ces`
        // with double the number of stack allocated elements.

        // Note: These are used only after the identical prefix skipping,
        // but initializing these up here improves performance at the time
        // of writing. Presumably the source order affects the stack frame
        // layout.
        let mut left_ces: SmallVec<[CollationElement; CE_BUFFER_SIZE]> = SmallVec::new();
        let mut right_ces: SmallVec<[CollationElement; CE_BUFFER_SIZE]> = SmallVec::new();

        // The algorithm comes from CollationCompare::compareUpToQuaternary in ICU4C.

        let mut any_variable = false;
        let variable_top = self.variable_top();

        let tailoring = self.tailoring_or_root();
        let numeric_primary = self.numeric_primary();
        let mut left = collation_elements!(self, left_chars, tailoring, numeric_primary);
        let mut right = collation_elements!(self, right_chars, tailoring, numeric_primary);

        // Start identical prefix

        // The logic here to check whether the boundary found by skipping
        // the identical prefix is safe is complicated compared to the ICU4C
        // approach of having a set of characters that are unsafe as the character
        // immediately following the identical prefix. However, the approach here
        // avoids extra data, and working on the main data avoids the bug
        // possibility of data structures not being mutually consistent.

        // This code intentionally does not keep around the `CollationElement32`s
        // that have been read from the collation data tries, because keeping
        // them around turned out to be a pessimization: There would be added
        // branches on the hot path of the algorithm that maps characters to
        // collation elements, and the element size of the upcoming buffer
        // would grow.
        //
        // However, the values read from the normalization trie _are_ kept around,
        // since there is already a place where to put them.

        // This loop is only broken out of as goto forward.
        #[expect(clippy::never_loop)]
        'prefix: loop {
            if let Some(mut head_last_c) = head_chars.next() {
                let norm_trie = &self.decompositions.trie;
                let mut head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
                    head_last_c,
                    norm_trie.get(head_last_c),
                );
                let mut head_last_ce32 = CollationElement32::default();
                let mut head_last_ok = false;
                if let Some(left_different) = left.iter_next_before_init() {
                    left.prepend_upcoming_before_init(left_different.clone());
                    if let Some(right_different) = right.iter_next_before_init() {
                        // Note: left_different and right_different may both be U+FFFD.
                        right.prepend_upcoming_before_init(right_different.clone());

                        // The base logic is that a boundary between two starters
                        // that decompose to selves is safe iff the starter
                        // before the boundary can't contract a starter, the
                        // starter after the boundary doesn't have a prefix
                        // condition, and, with the numeric mode enabled,
                        // they aren't both numeric.
                        //
                        // This base logic is then extended with Hangul
                        // syllables and characters that decompose to a
                        // BMP starter followed by a BMP non-starter.
                        // The logic could be extended further, in
                        // particular to cover singleton decompositions
                        // to a BMP starter, but such characters can be
                        // expected to be rare enough in real-world input
                        // that it's not worthwhile to make this code more
                        // branchy.
                        //
                        // A Hangul syllable is safe on either side of the
                        // boundary, because Hangul syllables can't participate
                        // in contraction or have prefix conditions. They are
                        // also known not to be numeric.
                        //
                        // Hangul jamo is safe to look up from the main trie
                        // instead of the jamo table, because they aren't
                        // allowed to participate in contractions or prefix
                        // conditions, either, and are known not to be numeric.
                        //
                        // After a boundary, a decomposition to a BMP starter
                        // and a BMP non-starter can obviously be analyzed by
                        // considering the starter as if it was a starter
                        // that decomposes to self.
                        //
                        // Before a boundary the contraction condition considers
                        // whether the contraction can contract a starter.
                        // For the case of contracting a non-starter, it's
                        // fine for the BMP starter of the decomposition to
                        // contract the non-starter from the same decomposition:
                        // Since that would happen as part of the prefix that
                        // is identical, it wouldn't affect anything after.
                        //
                        // The case of contracting a non-starter other than
                        // the one that came from the decomposition itself
                        // is irrelevant, because we don't allow a non-starter
                        // right after the boundary regardless of the contraction
                        // status of what's before the boundary.
                        //
                        // Finally, decompositions to starter and non-starter
                        // are known not to be numeric.

                        // The checks below are repetitive, but an attempt to factor
                        // the repetitive code into an inlined function regressed
                        // performance, so it seems that having the control flow
                        // right here, without an intermediate enum from a
                        // function return to branch on, is important.
1150
1151                        // This loop is only broken out of as goto forward. The control flow
1152                        // is much more readable this way.
1153                        #[expect(clippy::never_loop)]
1154                        loop {
1155                            // The two highest bits are about NFC, which we don't
1156                            // care about here.
1157                            let decomposition = head_last.trie_val;
1158                            if (decomposition
1159                                & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
1160                                == 0
1161                            {
1162                                // Intentionally empty block to keep
1163                                // the same structure as in the cases
1164                                // where something happens here.
1165                            } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
1166                                && ((decomposition & LOW_ZEROS_MASK) != 0)
1167                            {
1168                                // Decomposition into two BMP characters: starter and non-starter
1169                                // Let's take the starter
1170                                head_last_c = char_from_u32(decomposition & 0x7FFF);
1171                            } else if decomposition == HANGUL_SYLLABLE_MARKER {
1172                                head_last_ce32 = FFFD_CE32;
1173                            } else {
1174                                break;
1175                            }
1176                            head_last_ok = true;
1177
1178                            let mut left_ce32 = CollationElement32::default();
1179                            let mut right_ce32 = CollationElement32::default();
1180                            let left_c;
1181                            let right_c;
1182
1183                            let decomposition = left_different.trie_val;
1184                            if (decomposition
1185                                & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
1186                                == 0
1187                            {
1188                                left_c = left_different.character();
1189                            } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
1190                                && ((decomposition & LOW_ZEROS_MASK) != 0)
1191                            {
1192                                // Decomposition into two BMP characters: starter and non-starter
1193                                // Let's take the starter
1194                                left_c = char_from_u32(decomposition & 0x7FFF);
1195                            } else if decomposition == HANGUL_SYLLABLE_MARKER {
1196                                left_c = '\u{0}';
1197                                left_ce32 = FFFD_CE32;
1198                            } else {
1199                                break;
1200                            }
1201
1202                            let decomposition = right_different.trie_val;
1203                            if (decomposition
1204                                & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
1205                                == 0
1206                            {
1207                                right_c = right_different.character();
1208                            } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
1209                                && ((decomposition & LOW_ZEROS_MASK) != 0)
1210                            {
1211                                // Decomposition into two BMP characters: starter and non-starter
1212                                // Let's take the starter
1213                                right_c = char_from_u32(decomposition & 0x7FFF);
1214                            } else if decomposition == HANGUL_SYLLABLE_MARKER {
1215                                right_c = '\u{0}';
1216                                right_ce32 = FFFD_CE32;
1217                            } else {
1218                                break;
1219                            }
1220
1221                            // The last character of the prefix is OK on the normalization
1222                            // level. Now let's check its ce32 unless it's a Hangul syllable,
1223                            // in which case `head_last_ce32` already is a non-default placeholder.
1224                            if head_last_ce32 == CollationElement32::default() {
1225                                head_last_ce32 = tailoring.ce32_for_char(head_last_c);
1226                                if head_last_ce32 == FALLBACK_CE32 {
1227                                    head_last_ce32 = self.root.ce32_for_char(head_last_c);
1228                                }
1229                                if head_last_ce32.tag_checked() == Some(Tag::Contraction)
1230                                    && head_last_ce32.at_least_one_suffix_contains_starter()
1231                                {
1232                                    break;
1233                                }
1234                            }
1235                            // The first character of each suffix is OK on the normalization
1236                            // level. Now let's check their ce32s unless they are Hangul syllables.
1237                            if left_ce32 == CollationElement32::default() {
1238                                left_ce32 = tailoring.ce32_for_char(left_c);
1239                                if left_ce32 == FALLBACK_CE32 {
1240                                    left_ce32 = self.root.ce32_for_char(left_c);
1241                                }
1242                                if left_ce32.tag_checked() == Some(Tag::Prefix) {
1243                                    break;
1244                                }
1245                            }
1246                            if right_ce32 == CollationElement32::default() {
1247                                right_ce32 = tailoring.ce32_for_char(right_c);
1248                                if right_ce32 == FALLBACK_CE32 {
1249                                    right_ce32 = self.root.ce32_for_char(right_c);
1250                                }
1251                                if right_ce32.tag_checked() == Some(Tag::Prefix) {
1252                                    break;
1253                                }
1254                            }
1255                            if self.options.numeric()
1256                                && head_last_ce32.tag_checked() == Some(Tag::Digit)
1257                                && (left_ce32.tag_checked() == Some(Tag::Digit)
1258                                    || right_ce32.tag_checked() == Some(Tag::Digit))
1259                            {
1260                                break;
1261                            }
1262                            // We are at a good boundary!
1263                            break 'prefix;
1264                        }
1265                    }
1266                }
1267                let mut tail_first_c;
1268                let mut tail_first_ce32;
1269                let mut tail_first_ok;
1270                loop {
1271                    // Take a step back.
1272                    left.prepend_upcoming_before_init(head_last.clone());
1273                    right.prepend_upcoming_before_init(head_last.clone());
1274
1275                    tail_first_c = head_last_c;
1276                    tail_first_ce32 = head_last_ce32;
1277                    tail_first_ok = head_last_ok;
1278
1279                    head_last_c = if let Some(head_last_c) = head_chars.next() {
1280                        head_last_c
1281                    } else {
1282                        // We need to step back beyond the start of the prefix.
1283                        // Treat as good boundary.
1284                        break 'prefix;
1285                    };
1286                    let decomposition = norm_trie.get(head_last_c);
1287                    head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
1288                        head_last_c,
1289                        decomposition,
1290                    );
1291                    head_last_ce32 = CollationElement32::default();
1292                    head_last_ok = false;
1293
1294                    if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
1295                        // Intentionally empty block to keep
1296                        // the same structure as in the cases
1297                        // where something happens here.
1298                    } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
1299                        && ((decomposition & LOW_ZEROS_MASK) != 0)
1300                    {
1301                        // Decomposition into two BMP characters: starter and non-starter
1302                        // Let's take the starter
1303                        head_last_c = char_from_u32(decomposition & 0x7FFF);
1304                    } else if decomposition == HANGUL_SYLLABLE_MARKER {
1305                        head_last_ce32 = FFFD_CE32;
1306                    } else {
1307                        continue;
1308                    }
1309                    head_last_ok = true;
1310                    if !tail_first_ok {
1311                        continue;
1312                    }
1313                    // The last character of the prefix is OK on the normalization
1314                    // level. Now let's check its ce32 unless it's a Hangul syllable.
1315                    if head_last_ce32 == CollationElement32::default() {
1316                        head_last_ce32 = tailoring.ce32_for_char(head_last_c);
1317                        if head_last_ce32 == FALLBACK_CE32 {
1318                            head_last_ce32 = self.root.ce32_for_char(head_last_c);
1319                        }
1320                        if head_last_ce32.tag_checked() == Some(Tag::Contraction)
1321                            && head_last_ce32.at_least_one_suffix_contains_starter()
1322                        {
1323                            continue;
1324                        }
1325                    }
                    // Check this _after_ `head_last_ce32` to make sure
                    // `head_last_ce32` is initialized for the next round of the
                    // loop if applicable.
1329                    if tail_first_ce32 == CollationElement32::default() {
1330                        tail_first_ce32 = tailoring.ce32_for_char(tail_first_c);
1331                        if tail_first_ce32 == FALLBACK_CE32 {
1332                            tail_first_ce32 = self.root.ce32_for_char(tail_first_c);
1333                        }
                    } // else we already have a trie value from the previous loop iteration or we have a Hangul syllable
1335                    if tail_first_ce32.tag_checked() == Some(Tag::Prefix) {
1336                        continue;
1337                    }
1338                    if self.options.numeric()
1339                        && head_last_ce32.tag_checked() == Some(Tag::Digit)
1340                        && tail_first_ce32.tag_checked() == Some(Tag::Digit)
1341                    {
1342                        continue;
1343                    }
1344                    // We are at a good boundary!
1345                    break 'prefix;
1346                }
1347            } else {
1348                // The prefix is empty
1349                break 'prefix;
1350            }
1351            // Unreachable line
1352        }
1353
1354        // End identical prefix
1355
1356        left.init();
1357        right.init();
1358
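        // Compare the primary level first while buffering every collation element
        // for the later levels. CEs whose primary weight is below the variable top
        // are "shifted": their non-primary weights are zeroed here so that they
        // participate only in the quaternary level further below.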
1359        loop {
1360            let mut left_primary;
1361            'left_primary_loop: loop {
1362                let ce = left.next();
1363                left_primary = ce.primary();
1364                // TODO(#2008): Consider compiling out the variable handling when we know we aren't
1365                // shifting variable CEs.
1366                if !(left_primary < variable_top && left_primary > MERGE_SEPARATOR_PRIMARY) {
1367                    left_ces.push(ce);
1368                } else {
1369                    // Variable CE, shift it to quaternary level.
1370                    // Ignore all following primary ignorables, and shift further variable CEs.
1371                    any_variable = true;
1372                    // Relative to ICU4C, the next line is hoisted out of the following loop
1373                    // in order to keep the variables called `ce` immutable to make it easier
1374                    // to reason about each assignment into `ce` resulting in exactly a single
1375                    // push into `left_ces`.
1376                    left_ces.push(ce.clone_with_non_primary_zeroed());
1377                    loop {
1378                        // This loop is simpler than in ICU4C; unlike in C++, we get to break by label.
1379                        let ce = left.next();
1380                        left_primary = ce.primary();
1381                        if left_primary != 0
1382                            && !(left_primary < variable_top
1383                                && left_primary > MERGE_SEPARATOR_PRIMARY)
1384                        {
1385                            // Neither a primary ignorable nor a variable CE.
1386                            left_ces.push(ce);
1387                            break 'left_primary_loop;
1388                        }
1389                        // If `left_primary == 0`, the following line ignores a primary-ignorable.
1390                        // Otherwise, it shifts a variable CE.
1391                        left_ces.push(ce.clone_with_non_primary_zeroed());
1392                    }
1393                }
1394                if left_primary != 0 {
1395                    break;
1396                }
1397            }
1398            let mut right_primary;
1399            'right_primary_loop: loop {
1400                let ce = right.next();
1401                right_primary = ce.primary();
1402                // TODO(#2008): Consider compiling out the variable handling when we know we aren't
1403                // shifting variable CEs.
1404                if !(right_primary < variable_top && right_primary > MERGE_SEPARATOR_PRIMARY) {
1405                    right_ces.push(ce);
1406                } else {
1407                    // Variable CE, shift it to quaternary level.
1408                    // Ignore all following primary ignorables, and shift further variable CEs.
1409                    any_variable = true;
1410                    // Relative to ICU4C, the next line is hoisted out of the following loop
1411                    // in order to keep the variables called `ce` immutable to make it easier
1412                    // to reason about each assignment into `ce` resulting in exactly a single
1413                    // push into `right_ces`.
1414                    right_ces.push(ce.clone_with_non_primary_zeroed());
1415                    loop {
1416                        // This loop is simpler than in ICU4C; unlike in C++, we get to break by label.
1417                        let ce = right.next();
1418                        right_primary = ce.primary();
1419                        if right_primary != 0
1420                            && !(right_primary < variable_top
1421                                && right_primary > MERGE_SEPARATOR_PRIMARY)
1422                        {
1423                            // Neither a primary ignorable nor a variable CE.
1424                            right_ces.push(ce);
1425                            break 'right_primary_loop;
1426                        }
1427                        // If `right_primary == 0`, the following line ignores a primary-ignorable.
1428                        // Otherwise, it shifts a variable CE.
1429                        right_ces.push(ce.clone_with_non_primary_zeroed());
1430                    }
1431                }
1432                if right_primary != 0 {
1433                    break;
1434                }
1435            }
1436            if left_primary != right_primary {
1437                if let Some(reordering) = &self.reordering {
1438                    left_primary = reordering.reorder(left_primary);
1439                    right_primary = reordering.reorder(right_primary);
1440                }
1441                if left_primary < right_primary {
1442                    return Ordering::Less;
1443                }
1444                return Ordering::Greater;
1445            }
1446            if left_primary == NO_CE_PRIMARY {
1447                break;
1448            }
1449        }
1450
1451        // Sadly, we end up pushing the sentinel value, which means these
1452        // `SmallVec`s allocate more often than if we didn't actually
1453        // store the sentinel.
1454        debug_assert_eq!(left_ces.last(), Some(&NO_CE));
1455        debug_assert_eq!(right_ces.last(), Some(&NO_CE));
1456
1457        // Note: `unwrap_or_default` in the iterations below should never
1458        // actually end up using the "_or_default" part, because the sentinel
1459        // is in the `SmallVec`s. These could be changed to `unwrap()` if we
1460        // preferred panic in case of a bug.
1461        // TODO(#2009): Should we save one slot by not putting the sentinel in
1462        // the `SmallVec`s? So far, the answer seems "no", as it would complicate
1463        // the primary comparison above.
1464
1465        // Compare the buffered secondary & tertiary weights.
1466        // We might skip the secondary level but continue with the case level
1467        // which is turned on separately.
1468        if self.options.strength() >= Strength::Secondary {
1469            if !self.options.backward_second_level() {
1470                let mut left_iter = left_ces.iter();
1471                let mut right_iter = right_ces.iter();
1472                let mut left_secondary;
1473                let mut right_secondary;
1474                loop {
1475                    loop {
1476                        left_secondary = left_iter.next().unwrap_or_default().secondary();
1477                        if left_secondary != 0 {
1478                            break;
1479                        }
1480                    }
1481                    loop {
1482                        right_secondary = right_iter.next().unwrap_or_default().secondary();
1483                        if right_secondary != 0 {
1484                            break;
1485                        }
1486                    }
1487                    if left_secondary != right_secondary {
1488                        if left_secondary < right_secondary {
1489                            return Ordering::Less;
1490                        }
1491                        return Ordering::Greater;
1492                    }
1493                    if left_secondary == NO_CE_SECONDARY {
1494                        break;
1495                    }
1496                }
1497            } else {
1498                let mut left_remaining = &left_ces[..];
1499                let mut right_remaining = &right_ces[..];
1500                loop {
1501                    if left_remaining.is_empty() {
1502                        debug_assert!(right_remaining.is_empty());
1503                        break;
1504                    }
1505                    let (left_prefix, right_prefix) = {
1506                        let mut left_iter = left_remaining.iter();
1507                        loop {
1508                            let left_primary = left_iter.next().unwrap_or_default().primary();
1509                            if left_primary != 0 && left_primary <= MERGE_SEPARATOR_PRIMARY {
1510                                break;
1511                            }
1512                            debug_assert_ne!(left_primary, NO_CE_PRIMARY);
1513                        }
1514                        let left_new_remaining = left_iter.as_slice();
1515                        // Index in range by construction
1516                        #[expect(clippy::indexing_slicing)]
1517                        let left_prefix =
1518                            &left_remaining[..left_remaining.len() - 1 - left_new_remaining.len()];
1519                        left_remaining = left_new_remaining;
1520
1521                        let mut right_iter = right_remaining.iter();
1522                        loop {
1523                            let right_primary = right_iter.next().unwrap_or_default().primary();
1524                            if right_primary != 0 && right_primary <= MERGE_SEPARATOR_PRIMARY {
1525                                break;
1526                            }
1527                            debug_assert_ne!(right_primary, NO_CE_PRIMARY);
1528                        }
1529                        let right_new_remaining = right_iter.as_slice();
1530                        // Index in range by construction
1531                        #[expect(clippy::indexing_slicing)]
1532                        let right_prefix = &right_remaining
1533                            [..right_remaining.len() - 1 - right_new_remaining.len()];
1534                        right_remaining = right_new_remaining;
1535
1536                        (left_prefix, right_prefix)
1537                    };
1538                    let mut left_iter = left_prefix.iter();
1539                    let mut right_iter = right_prefix.iter();
1540
1541                    let mut left_secondary;
1542                    let mut right_secondary;
1543                    loop {
1544                        loop {
1545                            left_secondary = left_iter.next_back().unwrap_or_default().secondary();
1546                            if left_secondary != 0 {
1547                                break;
1548                            }
1549                        }
1550                        loop {
1551                            right_secondary =
1552                                right_iter.next_back().unwrap_or_default().secondary();
1553                            if right_secondary != 0 {
1554                                break;
1555                            }
1556                        }
1557                        if left_secondary != right_secondary {
1558                            if left_secondary < right_secondary {
1559                                return Ordering::Less;
1560                            }
1561                            return Ordering::Greater;
1562                        }
1563                        if left_secondary == NO_CE_SECONDARY {
1564                            break;
1565                        }
1566                    }
1567                }
1568            }
1569        }
1570
1571        if self.options.case_level() {
1572            if self.options.strength() == Strength::Primary {
1573                // Primary+caseLevel: Ignore case level weights of primary ignorables.
1574                // Otherwise we would get a-umlaut > a
1575                // which is not desirable for accent-insensitive sorting.
1576                // Check for (lower 32 bits) == 0 as well because variable CEs are stored
1577                // with only primary weights.
1578                let mut left_non_primary;
1579                let mut right_non_primary;
1580                let mut left_case;
1581                let mut right_case;
1582                let mut left_iter = left_ces.iter();
1583                let mut right_iter = right_ces.iter();
1584                loop {
1585                    loop {
1586                        let ce = left_iter.next().unwrap_or_default();
1587                        left_non_primary = ce.non_primary();
1588                        if !ce.either_half_zero() {
1589                            break;
1590                        }
1591                    }
1592                    left_case = left_non_primary.case();
1593                    loop {
1594                        let ce = right_iter.next().unwrap_or_default();
1595                        right_non_primary = ce.non_primary();
1596                        if !ce.either_half_zero() {
1597                            break;
1598                        }
1599                    }
1600                    right_case = right_non_primary.case();
1601                    // No need to handle NO_CE and MERGE_SEPARATOR specially:
1602                    // There is one case weight for each previous-level weight,
1603                    // so level length differences were handled there.
1604                    if left_case != right_case {
1605                        if !self.options.upper_first() {
1606                            if left_case < right_case {
1607                                return Ordering::Less;
1608                            }
1609                            return Ordering::Greater;
1610                        }
1611                        if left_case < right_case {
1612                            return Ordering::Greater;
1613                        }
1614                        return Ordering::Less;
1615                    }
1616                    if left_non_primary.secondary() == NO_CE_SECONDARY {
1617                        break;
1618                    }
1619                }
1620            } else {
1621                // Secondary+caseLevel: By analogy with the above,
1622                // ignore case level weights of secondary ignorables.
1623                //
1624                // Note: A tertiary CE has uppercase case bits (0.0.ut)
1625                // to keep tertiary+caseFirst well-formed.
1626                //
1627                // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
1628                // Otherwise a tertiary CE's uppercase would be no greater than
1629                // a primary/secondary CE's uppercase.
1630                // (See UCA well-formedness condition 2.)
1631                // We could construct a special case weight higher than uppercase,
1632                // but it's simpler to always ignore case weights of secondary ignorables,
1633                // turning 0.0.ut into 0.0.0.t.
1634                // (See LDML Collation, Case Parameters.)
1635                let mut left_non_primary;
1636                let mut right_non_primary;
1637                let mut left_case;
1638                let mut right_case;
1639                let mut left_iter = left_ces.iter();
1640                let mut right_iter = right_ces.iter();
1641                loop {
1642                    loop {
1643                        left_non_primary = left_iter.next().unwrap_or_default().non_primary();
1644                        if left_non_primary.secondary() != 0 {
1645                            break;
1646                        }
1647                    }
1648                    left_case = left_non_primary.case();
1649                    loop {
1650                        right_non_primary = right_iter.next().unwrap_or_default().non_primary();
1651                        if right_non_primary.secondary() != 0 {
1652                            break;
1653                        }
1654                    }
1655                    right_case = right_non_primary.case();
1656                    // No need to handle NO_CE and MERGE_SEPARATOR specially:
1657                    // There is one case weight for each previous-level weight,
1658                    // so level length differences were handled there.
1659                    if left_case != right_case {
1660                        if !self.options.upper_first() {
1661                            if left_case < right_case {
1662                                return Ordering::Less;
1663                            }
1664                            return Ordering::Greater;
1665                        }
1666                        if left_case < right_case {
1667                            return Ordering::Greater;
1668                        }
1669                        return Ordering::Less;
1670                    }
1671                    if left_non_primary.secondary() == NO_CE_SECONDARY {
1672                        break;
1673                    }
1674                }
1675            }
1676        }
1677
1678        if let Some(tertiary_mask) = self.options.tertiary_mask() {
1679            let mut any_quaternaries = AnyQuaternaryAccumulator::new();
1680            let mut left_iter = left_ces.iter();
1681            let mut right_iter = right_ces.iter();
1682            loop {
1683                let mut left_non_primary;
1684                let mut left_tertiary;
1685                loop {
1686                    left_non_primary = left_iter.next().unwrap_or_default().non_primary();
1687                    any_quaternaries.accumulate(left_non_primary);
1688                    debug_assert!(
1689                        left_non_primary.tertiary() != 0 || left_non_primary.case_quaternary() == 0
1690                    );
1691                    left_tertiary = left_non_primary.tertiary_case_quarternary(tertiary_mask);
1692                    if left_tertiary != 0 {
1693                        break;
1694                    }
1695                }
1696
1697                let mut right_non_primary;
1698                let mut right_tertiary;
1699                loop {
1700                    right_non_primary = right_iter.next().unwrap_or_default().non_primary();
1701                    any_quaternaries.accumulate(right_non_primary);
1702                    debug_assert!(
1703                        right_non_primary.tertiary() != 0
1704                            || right_non_primary.case_quaternary() == 0
1705                    );
1706                    right_tertiary = right_non_primary.tertiary_case_quarternary(tertiary_mask);
1707                    if right_tertiary != 0 {
1708                        break;
1709                    }
1710                }
1711
1712                if left_tertiary != right_tertiary {
1713                    if self.options.upper_first() {
1714                        // Pass through NO_CE and keep real tertiary weights larger than that.
1715                        // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
1716                        // to keep tertiary CEs well-formed.
1717                        // Their case+tertiary weights must be greater than those of
1718                        // primary and secondary CEs.
1719                        // Magic numbers from ICU4C.
1720                        if left_tertiary > NO_CE_TERTIARY {
1721                            if left_non_primary.secondary() != 0 {
1722                                left_tertiary ^= 0xC000;
1723                            } else {
1724                                left_tertiary += 0x4000;
1725                            }
1726                        }
1727                        if right_tertiary > NO_CE_TERTIARY {
1728                            if right_non_primary.secondary() != 0 {
1729                                right_tertiary ^= 0xC000;
1730                            } else {
1731                                right_tertiary += 0x4000;
1732                            }
1733                        }
1734                    }
1735                    if left_tertiary < right_tertiary {
1736                        return Ordering::Less;
1737                    }
1738                    return Ordering::Greater;
1739                }
1740
1741                if left_tertiary == NO_CE_TERTIARY {
1742                    break;
1743                }
1744            }
1745            if !any_variable && !any_quaternaries.has_quaternary() {
1746                return Ordering::Equal;
1747            }
1748        } else {
1749            return Ordering::Equal;
1750        }
1751
1752        if self.options.strength() <= Strength::Tertiary {
1753            return Ordering::Equal;
1754        }
1755
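        // Quaternary level. A shifted variable CE was buffered with only its primary
        // weight (so it is tertiary-ignorable here) and contributes that primary;
        // every other CE contributes its regular quaternary weight.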
1756        let mut left_iter = left_ces.iter();
1757        let mut right_iter = right_ces.iter();
1758        loop {
1759            let mut left_quaternary;
1760            loop {
1761                let ce = left_iter.next().unwrap_or_default();
1762                if ce.tertiary_ignorable() {
1763                    left_quaternary = ce.primary();
1764                } else {
1765                    left_quaternary = ce.quaternary();
1766                }
1767                if left_quaternary != 0 {
1768                    break;
1769                }
1770            }
1771            let mut right_quaternary;
1772            loop {
1773                let ce = right_iter.next().unwrap_or_default();
1774                if ce.tertiary_ignorable() {
1775                    right_quaternary = ce.primary();
1776                } else {
1777                    right_quaternary = ce.quaternary();
1778                }
1779                if right_quaternary != 0 {
1780                    break;
1781                }
1782            }
1783            if left_quaternary != right_quaternary {
1784                if let Some(reordering) = &self.reordering {
1785                    left_quaternary = reordering.reorder(left_quaternary);
1786                    right_quaternary = reordering.reorder(right_quaternary);
1787                }
1788                if left_quaternary < right_quaternary {
1789                    return Ordering::Less;
1790                }
1791                return Ordering::Greater;
1792            }
1793            if left_quaternary == NO_CE_PRIMARY {
1794                break;
1795            }
1796        }
1797
1798        Ordering::Equal
1799    }
1800
1801    fn sort_key_levels(&self) -> u8 {
1802        #[expect(clippy::indexing_slicing)]
1803        let mut levels = LEVEL_MASKS[self.options.strength() as usize];
1804        if self.options.case_level() {
1805            levels |= CASE_LEVEL_FLAG;
1806        }
1807        levels
1808    }
1809
1810    /// Given valid UTF-8, write the sort key bytes up to the collator's strength.
1811    ///
1812    /// The bytes are written to an implementor of [`CollationKeySink`], a no-std version of `std::io::Write`.
1813    /// This trait is currently unstable, but it is implemented by `Vec<u8>`, `VecDeque<u8>`,
1814    /// `SmallVec<[u8; N]>`, and `&mut [u8]` (returning an error if the slice is too small).
1815    ///
1816    /// If two sort keys generated at the same strength are compared bytewise, the result is
1817    /// the same as a collation comparison of the original strings at that strength.
1818    ///
    /// For the identical strength, the NFD normalization of the input is appended in a
    /// compressed (BOCSU) form to break ties.
1820    ///
1821    /// No terminating zero byte is written to the output, so the output is not a valid C
1822    /// string, but the caller may append a zero afterward if a C string is desired.
1823    ///
    /// ⚠️ Generating a sort key is expensive relative to comparison, because a comparison
    /// can skip the identical prefix and stop at the first difference, whereas a sort key
    /// must always be computed in full.  Only use sort keys if you expect to compare them
    /// many times, so that the cost of generating them is amortized; measuring this
    /// trade-off for your workload is a good idea.
1828    ///
1829    /// ⚠️ Sort keys, if stored durably, should be presumed to be invalidated by a CLDR update, a
1830    /// new version of Unicode, or an update to the ICU4X code.  Applications using sort keys
1831    /// *must* be prepared to recompute them if required and should take the performance of
1832    /// such an operation into account when deciding to use sort keys.
1833    ///
    /// ⚠️ If you store sort keys in a database that is, or becomes, so large that
    /// regenerating the sort keys is impractical, do not expect ICU4X to support
    /// using an older, frozen copy of the sort key generation algorithm with a later
    /// version of the library.
1838    ///
1839    /// # Example
1840    ///
1841    /// ```
1842    /// use icu_collator::{
1843    ///     options::{CollatorOptions, Strength},
1844    ///     Collator,
1845    /// };
1846    /// use icu_locale::locale;
    /// let locale = locale!("und").into();
1848    /// let mut options = CollatorOptions::default();
1849    /// options.strength = Some(Strength::Primary);
1850    /// let collator = Collator::try_new(locale, options).unwrap();
1851    ///
1852    /// let mut k1 = Vec::new();
1853    /// let Ok(()) = collator.write_sort_key_to("hello", &mut k1);
1854    /// let mut k2 = Vec::new();
1855    /// let Ok(()) = collator.write_sort_key_to("Héłłö", &mut k2);
1856    /// assert_eq!(k1, k2);
1857    /// ```
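    ///
    /// A fixed-size buffer can also serve as a sink; on success the number of bytes
    /// written is returned, and a too-small buffer is reported as an error carrying the
    /// required length.  (Sketch; the buffer sizes here are only illustrative, since the
    /// exact key length depends on the collation data and options.)
    ///
    /// ```
    /// use icu_collator::{options::CollatorOptions, Collator};
    /// use icu_locale::locale;
    /// let collator =
    ///     Collator::try_new(locale!("und").into(), CollatorOptions::default()).unwrap();
    ///
    /// // On success, the returned length says how many bytes of `buf` were used.
    /// let mut buf = [0u8; 64];
    /// let len = collator
    ///     .write_sort_key_to("hello", &mut buf[..])
    ///     .expect("64 bytes is plenty for this input");
    /// assert!(len <= buf.len());
    ///
    /// // A buffer that is too small reports an error instead.
    /// let mut tiny = [0u8; 2];
    /// assert!(collator.write_sort_key_to("hello", &mut tiny[..]).is_err());
    /// ```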
1858    pub fn write_sort_key_to<S>(&self, s: &str, sink: &mut S) -> Result<S::Output, S::Error>
1859    where
1860        S: CollationKeySink + ?Sized,
1861        S::State: Default,
1862    {
1863        self.write_sort_key_impl(s.chars(), sink)
1864    }
1865
1866    /// Given potentially invalid UTF-8, write the sort key bytes up to the collator's strength.
1867    ///
1868    /// For further details, see [`Self::write_sort_key_to`].
1869    pub fn write_sort_key_utf8_to<S>(&self, s: &[u8], sink: &mut S) -> Result<S::Output, S::Error>
1870    where
1871        S: CollationKeySink + ?Sized,
1872        S::State: Default,
1873    {
1874        self.write_sort_key_impl(s.chars(), sink)
1875    }
1876
1877    /// Given potentially invalid UTF-16, write the sort key bytes up to the collator's strength.
1878    ///
1879    /// For further details, see [`Self::write_sort_key_to`].
1880    pub fn write_sort_key_utf16_to<S>(&self, s: &[u16], sink: &mut S) -> Result<S::Output, S::Error>
1881    where
1882        S: CollationKeySink + ?Sized,
1883        S::State: Default,
1884    {
1885        self.write_sort_key_impl(s.chars(), sink)
1886    }
1887
1888    fn write_sort_key_impl<I, S>(&self, iter: I, sink: &mut S) -> Result<S::Output, S::Error>
1889    where
1890        I: Iterator<Item = char> + Clone,
1891        S: CollationKeySink + ?Sized,
1892        S::State: Default,
1893    {
1894        let identical = if self.options.strength() == Strength::Identical {
1895            Some(iter.clone())
1896        } else {
1897            None
1898        };
1899
1900        let mut state = S::State::default();
1901        self.write_sort_key_up_to_quaternary(iter, sink, &mut state)?;
1902
1903        if let Some(iter) = identical {
1904            let nfd =
1905                DecomposingNormalizerBorrowed::new_with_data(self.decompositions, self.tables);
1906            sink.write_byte(&mut state, LEVEL_SEPARATOR_BYTE)?;
1907
1908            let iter = nfd.normalize_iter(iter);
1909            write_identical_level(iter, sink, &mut state)?;
1910        }
1911
1912        sink.finish(state)
1913    }
1914
1915    /// Write the sort key bytes up to the collator's strength.
1916    ///
1917    /// Optionally write the case level.  Separate levels with the `LEVEL_SEPARATOR_BYTE`, but
1918    /// do not write a terminating zero as with a C string.
1919    fn write_sort_key_up_to_quaternary<I, S>(
1920        &self,
1921        iter: I,
1922        sink: &mut S,
1923        state: &mut S::State,
1924    ) -> Result<(), S::Error>
1925    where
1926        I: Iterator<Item = char>,
1927        S: CollationKeySink + ?Sized,
1928    {
1929        // This algorithm comes from `CollationKeys::writeSortKeyUpToQuaternary` in ICU4C.
1930        let levels = self.sort_key_levels();
1931
1932        let mut iter =
1933            collation_elements!(self, iter, self.tailoring_or_root(), self.numeric_primary());
1934        iter.init();
1935        let variable_top = self.variable_top();
1936
1937        let tertiary_mask = self.options.tertiary_mask().unwrap_or_default();
1938
1939        let mut cases = SortKeyLevel::default();
1940        let mut secondaries = SortKeyLevel::default();
1941        let mut tertiaries = SortKeyLevel::default();
1942        let mut quaternaries = SortKeyLevel::default();
1943
1944        let mut prev_reordered_primary = 0;
1945        let mut common_cases = 0usize;
1946        let mut common_secondaries = 0usize;
1947        let mut common_tertiaries = 0usize;
1948        let mut common_quaternaries = 0usize;
1949        let mut prev_secondary = 0;
1950        let mut sec_segment_start = 0;
1951
1952        loop {
1953            let mut ce = iter.next();
1954            let mut p = ce.primary();
1955            if p < variable_top && p > MERGE_SEPARATOR_PRIMARY {
1956                // Variable CE, shift it to quaternary level.  Ignore all following primary
1957                // ignorables, and shift further variable CEs.
1958                if common_quaternaries != 0 {
1959                    common_quaternaries -= 1;
1960                    while common_quaternaries >= QUAT_COMMON[WEIGHT_MAX_COUNT] as _ {
1961                        quaternaries.append_byte(QUAT_COMMON[WEIGHT_MIDDLE]);
1962                        common_quaternaries -= QUAT_COMMON[WEIGHT_MAX_COUNT] as usize;
1963                    }
1964                    // Shifted primary weights are lower than the common weight.
1965                    quaternaries.append_byte(QUAT_COMMON[WEIGHT_LOW] + common_quaternaries as u8);
1966                    common_quaternaries = 0;
1967                }
1968
1969                loop {
1970                    if levels & QUATERNARY_LEVEL_FLAG != 0 {
1971                        if let Some(reordering) = &self.reordering {
1972                            p = reordering.reorder(p);
1973                        }
1974                        if (p >> 24) as u8 >= QUAT_SHIFTED_LIMIT_BYTE {
1975                            // Prevent shifted primary lead bytes from overlapping with the
1976                            // common compression range.
1977                            quaternaries.append_byte(QUAT_SHIFTED_LIMIT_BYTE);
1978                        }
1979                        quaternaries.append_weight_32(p);
1980                    }
1981                    loop {
1982                        ce = iter.next();
1983                        p = ce.primary();
1984                        if p != 0 {
1985                            break;
1986                        }
1987                    }
1988                    if !(p < variable_top && p > MERGE_SEPARATOR_PRIMARY) {
1989                        break;
1990                    }
1991                }
1992            }
1993
1994            // ce could be primary ignorable, or NO_CE, or the merge separator, or a regular
1995            // primary CE, but it is not variable.  If ce == NO_CE, then write nothing for the
1996            // primary level but terminate compression on all levels and then exit the loop.
1997            if p > NO_CE_PRIMARY && levels & PRIMARY_LEVEL_FLAG != 0 {
1998                // Test the un-reordered primary for compressibility.
1999                let is_compressible = self.special_primaries.is_compressible((p >> 24) as _);
2000                if let Some(reordering) = &self.reordering {
2001                    p = reordering.reorder(p);
2002                }
2003                let p1 = (p >> 24) as u8;
2004                if !is_compressible || p1 != (prev_reordered_primary >> 24) as u8 {
2005                    if prev_reordered_primary != 0 {
2006                        if p < prev_reordered_primary {
2007                            // No primary compression terminator at the end of the level or
2008                            // merged segment.
2009                            if p1 > MERGE_SEPARATOR_BYTE {
2010                                sink.write(state, &[PRIMARY_COMPRESSION_LOW_BYTE])?;
2011                            }
2012                        } else {
2013                            sink.write(state, &[PRIMARY_COMPRESSION_HIGH_BYTE])?;
2014                        }
2015                    }
2016                    sink.write_byte(state, p1)?;
2017                    prev_reordered_primary = if is_compressible { p } else { 0 };
2018                }
2019
2020                let p2 = (p >> 16) as u8;
2021                if p2 != 0 {
2022                    let (b0, b1, b2) = (p2, (p >> 8) as _, p as _);
2023                    sink.write_byte(state, b0)?;
2024                    if b1 != 0 {
2025                        sink.write_byte(state, b1)?;
2026                        if b2 != 0 {
2027                            sink.write_byte(state, b2)?;
2028                        }
2029                    }
2030                }
2031            }
2032
2033            let non_primary = ce.non_primary();
2034            if non_primary.ignorable() {
2035                continue; // completely ignorable, no secondary/case/tertiary/quaternary
2036            }
2037
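            // `handle_common!` flushes a pending run of "common" weights using the
            // run-length scheme from ICU4C: each full block of the run becomes one
            // middle byte, and the remainder becomes a low or high terminator byte
            // depending on whether the upcoming weight sorts below or above the
            // common range.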
2038            macro_rules! handle_common {
2039                ($key:ident, $w:ident, $common:ident, $weights:ident, $lim:expr) => {
2040                    if $common != 0 {
2041                        $common -= 1;
2042                        while $common >= $weights[WEIGHT_MAX_COUNT] as _ {
2043                            $key.append_byte($weights[WEIGHT_MIDDLE]);
2044                            $common -= $weights[WEIGHT_MAX_COUNT] as usize;
2045                        }
2046                        let b = if $w < $lim {
2047                            $weights[WEIGHT_LOW] + ($common as u8)
2048                        } else {
2049                            $weights[WEIGHT_HIGH] - ($common as u8)
2050                        };
2051                        $key.append_byte(b);
2052                        $common = 0;
2053                    }
2054                };
2055                ($key:ident, $w:ident, $common:ident, $weights:ident) => {
2056                    handle_common!($key, $w, $common, $weights, COMMON_WEIGHT16);
2057                };
2058            }
2059
2060            if levels & SECONDARY_LEVEL_FLAG != 0 {
2061                let s = non_primary.secondary();
2062                if s == 0 {
2063                    // secondary ignorable
2064                } else if s == COMMON_WEIGHT16
2065                    && (!self.options.backward_second_level() || p != MERGE_SEPARATOR_PRIMARY)
2066                {
2067                    // s is a common secondary weight, and backwards-secondary is off or the ce
2068                    // is not the merge separator.
2069                    common_secondaries += 1;
2070                } else if !self.options.backward_second_level() {
2071                    handle_common!(secondaries, s, common_secondaries, SEC_COMMON);
2072                    secondaries.append_weight_16(s);
2073                } else {
2074                    if common_secondaries != 0 {
2075                        common_secondaries -= 1;
2076                        // Append reverse weights.  The level will be re-reversed later.
2077                        let remainder = common_secondaries % SEC_COMMON[WEIGHT_MAX_COUNT] as usize;
2078                        let b = if prev_secondary < COMMON_WEIGHT16 {
2079                            SEC_COMMON[WEIGHT_LOW] + remainder as u8
2080                        } else {
2081                            SEC_COMMON[WEIGHT_HIGH] - remainder as u8
2082                        };
2083                        secondaries.append_byte(b);
2084                        common_secondaries -= remainder;
2085                        // common_secondaries is now a multiple of SEC_COMMON[WEIGHT_MAX_COUNT]
2086                        while common_secondaries > 0 {
2087                            // same as >= SEC_COMMON[WEIGHT_MAX_COUNT]
2088                            secondaries.append_byte(SEC_COMMON[WEIGHT_MIDDLE]);
2089                            common_secondaries -= SEC_COMMON[WEIGHT_MAX_COUNT] as usize;
2090                        }
                        // common_secondaries == 0
2092                    }
2093                    if 0 < p && p <= MERGE_SEPARATOR_PRIMARY {
2094                        // The backwards secondary level compares secondary weights backwards
2095                        // within segments separated by the merge separator (U+FFFE).
                        let secs = &mut secondaries.buf;
                        // Reverse the current segment in place. (Guard against an
                        // empty buffer to avoid index underflow.)
                        if secs.len() > sec_segment_start + 1 {
                            let mut q = sec_segment_start;
                            let mut r = secs.len() - 1;

                            // these indices start at valid values and we stop when they cross
                            #[expect(clippy::indexing_slicing)]
                            while q < r {
                                let b = secs[q];
                                secs[q] = secs[r];
                                q += 1;
                                secs[r] = b;
                                r -= 1;
                            }
                        }
2112                        let b = if p == NO_CE_PRIMARY {
2113                            LEVEL_SEPARATOR_BYTE
2114                        } else {
2115                            MERGE_SEPARATOR_BYTE
2116                        };
2117                        secondaries.append_byte(b);
2118                        prev_secondary = 0;
2119                        sec_segment_start = secondaries.len();
2120                    } else {
2121                        secondaries.append_reverse_weight_16(s);
2122                        prev_secondary = s;
2123                    }
2124                }
2125            }
2126
2127            if levels & CASE_LEVEL_FLAG != 0 {
2128                if self.options.strength() == Strength::Primary && p == 0
2129                    || non_primary.bits() <= 0xffff
2130                {
2131                    // Primary+caseLevel: Ignore case level weights of primary ignorables.
2132                    // Otherwise: Ignore case level weights of secondary ignorables.  For
2133                    // details see the comments in the CollationCompare class.
2134                } else {
2135                    // case bits & tertiary lead byte
2136                    let mut c = ((non_primary.bits() >> 8) & 0xff) as u8;
2137                    debug_assert_ne!(c & 0xc0, 0xc0);
2138                    if c & 0xc0 == 0 && c > LEVEL_SEPARATOR_BYTE {
2139                        common_cases += 1;
2140                    } else {
2141                        if !self.options.upper_first() {
2142                            // lower first:  Compress common weights to nibbles 1..7..13,
2143                            // mixed=14, upper=15.  If there are only common (=lowest) weights
2144                            // in the whole level, then we need not write anything.  Level
2145                            // length differences are handled already on the next-higher level.
2146                            if common_cases != 0 && (c > LEVEL_SEPARATOR_BYTE || !cases.is_empty())
2147                            {
2148                                common_cases -= 1;
2149                                while common_cases >= CASE_LOWER_FIRST_COMMON[WEIGHT_MAX_COUNT] as _
2150                                {
2151                                    cases.append_byte(CASE_LOWER_FIRST_COMMON[WEIGHT_MIDDLE] << 4);
2152                                    common_cases -=
2153                                        CASE_LOWER_FIRST_COMMON[WEIGHT_MAX_COUNT] as usize;
2154                                }
2155                                let b = if c <= LEVEL_SEPARATOR_BYTE {
2156                                    CASE_LOWER_FIRST_COMMON[WEIGHT_LOW] + common_cases as u8
2157                                } else {
2158                                    CASE_LOWER_FIRST_COMMON[WEIGHT_HIGH] - common_cases as u8
2159                                };
2160                                cases.append_byte(b << 4);
2161                                common_cases = 0;
2162                            }
2163                            if c > LEVEL_SEPARATOR_BYTE {
2164                                // 14 or 15
2165                                c = (CASE_LOWER_FIRST_COMMON[WEIGHT_HIGH] + (c >> 6)) << 4;
2166                            }
2167                        } else {
2168                            // upper first:  Compress common weights to nibbles 3..15, mixed=2,
2169                            // upper=1.  The compressed common case weights only go up from the
2170                            // "low" value because with upperFirst the common weight is the
2171                            // highest one.
2172                            if common_cases != 0 {
2173                                common_cases -= 1;
2174                                while common_cases >= CASE_UPPER_FIRST_COMMON[WEIGHT_MAX_COUNT] as _
2175                                {
2176                                    cases.append_byte(CASE_UPPER_FIRST_COMMON[WEIGHT_LOW] << 4);
2177                                    common_cases -=
2178                                        CASE_UPPER_FIRST_COMMON[WEIGHT_MAX_COUNT] as usize;
2179                                }
2180                                cases.append_byte(
2181                                    (CASE_UPPER_FIRST_COMMON[WEIGHT_LOW] + common_cases as u8) << 4,
2182                                );
2183                                common_cases = 0;
2184                            }
2185                            if c > LEVEL_SEPARATOR_BYTE {
2186                                // 2 or 1
2187                                c = (CASE_UPPER_FIRST_COMMON[WEIGHT_LOW] - (c >> 6)) << 4;
2188                            }
2189                        }
2190                        // c is a separator byte 01 or a left-shifted nibble 0x10, 0x20, ...
2191                        // 0xf0.
2192                        cases.append_byte(c);
2193                    }
2194                }
2195            }
2196
2197            if levels & TERTIARY_LEVEL_FLAG != 0 {
2198                let mut t = non_primary.tertiary_case_quarternary(tertiary_mask);
2199                debug_assert_ne!(non_primary.bits() & 0xc000, 0xc000);
2200                if t == COMMON_WEIGHT16 {
2201                    common_tertiaries += 1;
2202                } else if tertiary_mask & 0x8000 == 0 {
2203                    // Tertiary weights without case bits.  Move lead bytes 06..3F to C6..FF
2204                    // for a large common-weight range.
2205                    handle_common!(tertiaries, t, common_tertiaries, TER_ONLY_COMMON);
2206                    if t > COMMON_WEIGHT16 {
2207                        t += 0xc000;
2208                    }
2209                    tertiaries.append_weight_16(t);
2210                } else if !self.options.upper_first() {
2211                    // Tertiary weights with caseFirst=lowerFirst.  Move lead bytes 06..BF to
2212                    // 46..FF for the common-weight range.
2213                    handle_common!(tertiaries, t, common_tertiaries, TER_LOWER_FIRST_COMMON);
2214                    if t > COMMON_WEIGHT16 {
2215                        t += 0x4000;
2216                    }
2217                    tertiaries.append_weight_16(t);
2218                } else {
2219                    // Tertiary weights with caseFirst=upperFirst.  Do not change the
2220                    // artificial uppercase weight of a tertiary CE (0.0.ut), to keep tertiary
2221                    // CEs well-formed.  Their case+tertiary weights must be greater than those
2222                    // of primary and secondary CEs.
2223                    //
2224                    // Separator         01 -> 01      (unchanged)
2225                    // Lowercase     02..04 -> 82..84  (includes uncased)
2226                    // Common weight     05 -> 85..C5  (common-weight compression range)
2227                    // Lowercase     06..3F -> C6..FF
2228                    // Mixed case    42..7F -> 42..7F
2229                    // Uppercase     82..BF -> 02..3F
2230                    // Tertiary CE   86..BF -> C6..FF
2231                    if t <= NO_CE_TERTIARY {
2232                        // Keep separators unchanged.
2233                    } else if non_primary.bits() > 0xffff {
2234                        // Invert case bits of primary & secondary CEs.
2235                        t ^= 0xc000;
2236                        if t < (TER_UPPER_FIRST_COMMON[WEIGHT_HIGH] as u16) << 8 {
2237                            t -= 0x4000;
2238                        }
2239                    } else {
2240                        // Keep uppercase bits of tertiary CEs.
2241                        debug_assert!((0x8600..=0xbfff).contains(&t));
2242                        t += 0x4000;
2243                    }
2244                    handle_common!(
2245                        tertiaries,
2246                        t,
2247                        common_tertiaries,
2248                        TER_UPPER_FIRST_COMMON,
2249                        (TER_UPPER_FIRST_COMMON[WEIGHT_LOW] as u16) << 8
2250                    );
2251                    tertiaries.append_weight_16(t);
2252                }
2253            }
2254
2255            if levels & QUATERNARY_LEVEL_FLAG != 0 {
2256                let q = (non_primary.bits() & 0xffff) as u16;
2257                if q & 0xc0 == 0 && q > NO_CE_QUATERNARY {
2258                    common_quaternaries += 1;
2259                } else if q == NO_CE_QUATERNARY
2260                    && self.options.alternate_handling() == AlternateHandling::NonIgnorable
2261                    && quaternaries.is_empty()
2262                {
2263                    // If alternate=non-ignorable and there are only common quaternary weights,
2264                    // then we need not write anything.  The only weights greater than the
2265                    // merge separator and less than the common weight are shifted primary
2266                    // weights, which are not generated for alternate=non-ignorable.  There are
2267                    // also exactly as many quaternary weights as tertiary weights, so level
2268                    // length differences are handled already on tertiary level.  Any
2269                    // above-common quaternary weight will compare greater regardless.
2270                    quaternaries.append_byte(LEVEL_SEPARATOR_BYTE);
2271                } else {
2272                    let q = if q == NO_CE_QUATERNARY {
2273                        LEVEL_SEPARATOR_BYTE
2274                    } else {
2275                        (0xfc + ((q >> 6) & 3)) as u8
2276                    };
2277                    handle_common!(
2278                        quaternaries,
2279                        q,
2280                        common_quaternaries,
2281                        QUAT_COMMON,
2282                        QUAT_COMMON[WEIGHT_LOW]
2283                    );
2284                    quaternaries.append_byte(q);
2285                }
2286            }
2287
2288            if (non_primary.bits() >> 24) as u8 == LEVEL_SEPARATOR_BYTE {
2289                break; // ce == NO_CE
2290            }
2291        }
2292
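        // Append each finished level to the sink, preceded by the level separator byte.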
2293        macro_rules! write_level {
2294            ($key:ident, $flag:ident) => {
2295                if levels & $flag != 0 {
2296                    sink.write(state, &[LEVEL_SEPARATOR_BYTE])?;
2297                    sink.write(state, &$key.buf)?;
2298                }
2299            };
2300        }
2301
2302        write_level!(secondaries, SECONDARY_LEVEL_FLAG);
2303
2304        if levels & CASE_LEVEL_FLAG != 0 {
2305            sink.write(state, &[LEVEL_SEPARATOR_BYTE])?;
2306
2307            // Write pairs of nibbles as bytes, except separator bytes as themselves.
2308            let mut b = 0;
2309            if let Some((last, head)) = cases.buf.split_last() {
2310                debug_assert_eq!(*last, 1); // The trailing NO_CE
2311                for c in head {
2312                    debug_assert_eq!(*c & 0xf, 0);
2313                    debug_assert_ne!(*c, 0);
2314                    if b == 0 {
2315                        b = *c;
2316                    } else {
2317                        sink.write_byte(state, b | (*c >> 4))?;
2318                        b = 0;
2319                    }
2320                }
2321            } else {
2322                debug_assert!(false);
2323            }
2324            if b != 0 {
2325                sink.write_byte(state, b)?;
2326            }
2327        }
2328
2329        write_level!(tertiaries, TERTIARY_LEVEL_FLAG);
2330        write_level!(quaternaries, QUATERNARY_LEVEL_FLAG);
2331
2332        Ok(())
2333    }
2334}
2335
/// Error returned when a [`CollationKeySink`] with limited capacity ran out of space.
2337#[derive(Debug, PartialEq, Eq)]
2338pub struct TooSmall {
2339    /// The total length, in bytes, of the sort key.
2340    pub length: usize,
2341}
2342
2343impl TooSmall {
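    /// Creates a new [`TooSmall`] error recording the total sort key length in bytes.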
2344    pub fn new(length: usize) -> Self {
2345        Self { length }
2346    }
2347}
2348
2349/// A [`std::io::Write`]-like trait for writing to a buffer-like object.
2350///
2351/// (This crate does not have access to [`std`].)
2352///
2353/// <div class="stab unstable">
2354/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
2355/// including in SemVer minor releases. Do not implement or call methods on this trait
2356/// unless you are prepared for things to occasionally break.
2357///
2358/// Graduation tracking issue: [issue #7178](https://github.com/unicode-org/icu4x/issues/7178).
2359/// </div>
2360///
2361/// ✨ *Enabled with the `unstable` Cargo feature.*
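///
/// For example, a sink that merely measures the key length without storing any
/// bytes could look roughly like this (sketch; the `icu_collator::CollationKeySink`
/// import path and the required Cargo features are assumptions):
///
/// ```
/// use core::convert::Infallible;
/// use icu_collator::CollationKeySink;
///
/// struct KeyLengthSink;
///
/// impl CollationKeySink for KeyLengthSink {
///     type Error = Infallible;
///     type State = usize;
///     type Output = usize;
///
///     fn write(&mut self, count: &mut usize, buf: &[u8]) -> Result<(), Infallible> {
///         // Count the bytes instead of storing them.
///         *count += buf.len();
///         Ok(())
///     }
///
///     fn finish(&mut self, count: usize) -> Result<usize, Infallible> {
///         Ok(count)
///     }
/// }
/// ```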
2362pub trait CollationKeySink {
2363    /// The type of error the sink may return.
2364    type Error;
2365
2366    /// An intermediate state object used by the sink, which must implement [`Default`].
2367    type State;
2368
2369    /// A result value indicating the final state of the sink (e.g. a number of bytes written).
2370    type Output;
2371
2372    /// Writes a buffer into the writer.
2373    fn write(&mut self, state: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error>;
2374
2375    /// Write a single byte into the writer.
2376    fn write_byte(&mut self, state: &mut Self::State, b: u8) -> Result<(), Self::Error> {
2377        self.write(state, &[b])
2378    }
2379
2380    /// Finalize any internal sink state (perhaps by flushing a buffer) and return the final
2381    /// output value.
2382    fn finish(&mut self, state: Self::State) -> Result<Self::Output, Self::Error>;
2383}
2384
2385impl CollationKeySink for Vec<u8> {
2386    type Error = Infallible;
2387    type State = ();
2388    type Output = ();
2389
2390    fn write(&mut self, _: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> {
2391        self.extend_from_slice(buf);
2392        Ok(())
2393    }
2394
2395    fn finish(&mut self, _: Self::State) -> Result<Self::Output, Self::Error> {
2396        Ok(())
2397    }
2398}
2399
2400impl CollationKeySink for VecDeque<u8> {
2401    type Error = Infallible;
2402    type State = ();
2403    type Output = ();
2404
2405    fn write(&mut self, _: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> {
2406        self.extend(buf.iter());
2407        Ok(())
2408    }
2409
2410    fn finish(&mut self, _: Self::State) -> Result<Self::Output, Self::Error> {
2411        Ok(())
2412    }
2413}
2414
2415impl<const N: usize> CollationKeySink for SmallVec<[u8; N]> {
2416    type Error = Infallible;
2417    type State = ();
2418    type Output = ();
2419
2420    fn write(&mut self, _: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> {
2421        self.extend_from_slice(buf);
2422        Ok(())
2423    }
2424
2425    fn finish(&mut self, _: Self::State) -> Result<Self::Output, Self::Error> {
2426        Ok(())
2427    }
2428}
2429
2430impl CollationKeySink for [u8] {
2431    type Error = TooSmall;
2432    type State = usize;
2433    type Output = usize;
2434
2435    fn write(&mut self, offset: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> {
2436        if *offset + buf.len() <= self.len() {
2437            // just checked bounds
2438            #[expect(clippy::indexing_slicing)]
2439            self[*offset..*offset + buf.len()].copy_from_slice(buf);
2440        }
2441        *offset += buf.len();
2442        Ok(())
2443    }
2444
2445    fn finish(&mut self, offset: Self::State) -> Result<Self::Output, Self::Error> {
2446        if offset <= self.len() {
2447            Ok(offset)
2448        } else {
2449            Err(TooSmall::new(offset))
2450        }
2451    }
2452}
2453
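/// Accumulates the bytes of a single sort key level (secondary, case, tertiary,
/// or quaternary). Weights are appended big-endian with their trailing zero
/// bytes omitted.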
#[derive(Default)]
struct SortKeyLevel {
    buf: SmallVec<[u8; 40]>,
}

impl SortKeyLevel {
    fn len(&self) -> usize {
        self.buf.len()
    }

    fn is_empty(&self) -> bool {
        self.buf.is_empty()
    }

    fn append_byte(&mut self, x: u8) {
        self.buf.push(x);
    }

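    // Weights are big-endian and left-aligned: zero bytes occur only as
    // trailing padding within a weight, so they are not appended here.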
    fn append_weight_16(&mut self, w: u16) {
        debug_assert_ne!(w, 0);
        let b0 = (w >> 8) as u8;
        let b1 = w as u8;
        self.append_byte(b0);
        if b1 != 0 {
            self.append_byte(b1);
        }
    }

    fn append_reverse_weight_16(&mut self, w: u16) {
        debug_assert_ne!(w, 0);
        let b0 = (w >> 8) as u8;
        let b1 = w as u8;
        if b1 != 0 {
            self.append_byte(b1);
        }
        self.append_byte(b0);
    }

    fn append_weight_32(&mut self, w: u32) {
        debug_assert_ne!(w, 0);
        let b0 = (w >> 24) as u8;
        let b1 = (w >> 16) as u8;
        let b2 = (w >> 8) as u8;
        let b3 = w as u8;
        self.append_byte(b0);
        if b1 != 0 {
            self.append_byte(b1);
            if b2 != 0 {
                self.append_byte(b2);
                if b3 != 0 {
                    self.append_byte(b3);
                }
            }
        }
    }
}

// The algorithm below (BOCSU, the Binary Ordered Compression Scheme for Unicode) is translated
// from the C++ code in ICU4C at icu4c/source/i18n/bocsu.{cpp,h}.  It works by converting a
// sequence of code points into a sequence of typically small differences between consecutive
// code points.  See the C++ code for a more detailed explanation.
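//
// Depending on the magnitude of a difference, `write_diff` emits one to four
// bytes.  The lead-byte ranges of the length classes are disjoint and ordered,
// so byte-wise comparison of the encoded output preserves the order of the
// differences themselves (see the `bocsu_diff_ordering` sketch at the end of
// the test module below).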
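// Floored division with a non-negative remainder: Rust's `/` and `%` truncate
// toward zero, which is the wrong rounding for the negative dividends below.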
macro_rules! negdivmod {
    ($n:ident, $d:ident, $m:ident) => {
        $m = $n % $d;
        $n /= $d;
        if $m < 0 {
            $n -= 1;
            $m += $d;
        }
    };
}

fn write_diff<S>(mut diff: i32, sink: &mut S, state: &mut S::State) -> Result<(), S::Error>
where
    S: CollationKeySink + ?Sized,
{
    let mut out = |b| sink.write_byte(state, b);

    if diff >= SLOPE_REACH_NEG_1 {
        if diff <= SLOPE_REACH_POS_1 {
            out((SLOPE_MIDDLE + diff) as _)?;
        } else if diff <= SLOPE_REACH_POS_2 {
            out((SLOPE_START_POS_2 + (diff / SLOPE_TAIL_COUNT)) as _)?;
            out((SLOPE_MIN + diff % SLOPE_TAIL_COUNT) as _)?;
        } else if diff <= SLOPE_REACH_POS_3 {
            let p2 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
            diff /= SLOPE_TAIL_COUNT;
            let p1 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
            let p0 = SLOPE_START_POS_3 + (diff / SLOPE_TAIL_COUNT);
            out(p0 as _)?;
            out(p1 as _)?;
            out(p2 as _)?;
        } else {
            let p3 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
            diff /= SLOPE_TAIL_COUNT;
            let p2 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
            diff /= SLOPE_TAIL_COUNT;
            let p1 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
            out(SLOPE_MAX as _)?;
            out(p1 as _)?;
            out(p2 as _)?;
            out(p3 as _)?;
        }
    } else {
        let mut m;

        if diff >= SLOPE_REACH_NEG_2 {
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            out((SLOPE_START_NEG_2 + diff) as _)?;
            out((SLOPE_MIN + m) as _)?;
        } else if diff >= SLOPE_REACH_NEG_3 {
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            let p2 = SLOPE_MIN + m;
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            let p1 = SLOPE_MIN + m;
            let p0 = SLOPE_START_NEG_3 + diff;
            out(p0 as _)?;
            out(p1 as _)?;
            out(p2 as _)?;
        } else {
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            let p3 = SLOPE_MIN + m;
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            let p2 = SLOPE_MIN + m;
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            let p1 = SLOPE_MIN + m;
            let _ = diff;
            out(SLOPE_MIN as _)?;
            out(p1 as _)?;
            out(p2 as _)?;
            out(p3 as _)?;
        }
    }

    Ok(())
}

fn write_identical_level<I, S>(iter: I, sink: &mut S, state: &mut S::State) -> Result<(), S::Error>
where
    I: Iterator<Item = char>,
    S: CollationKeySink + ?Sized,
{
    let mut prev = 0i32;

    for c in iter {
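        // Re-center `prev` so that code points near the previous one are
        // likely to encode as single-byte differences; the Unihan block gets
        // a fixed window anchored near its upper end instead.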
        if !(0x4e00..0xa000).contains(&prev) {
            prev = (prev & !0x7f) - SLOPE_REACH_NEG_1;
        } else {
            // Unihan U+4e00..U+9fa5:  double-bytes down from the upper end
            prev = 0x9fff - SLOPE_REACH_POS_2;
        }

        if c == MERGE_SEPARATOR {
            sink.write_byte(state, MERGE_SEPARATOR_BYTE)?;
            prev = 0;
        } else {
            let c = c as i32;
            write_diff(c - prev, sink, state)?;
            prev = c;
        }
    }
    Ok(())
}

#[cfg(test)]
mod test {
    use super::*;
    use icu_locale::locale;

    type Key = Vec<u8>;

    fn collator_en(strength: Strength) -> CollatorBorrowed<'static> {
        let locale = locale!("en").into();
        let mut options = CollatorOptions::default();
        options.strength = Some(strength);
        Collator::try_new(locale, options).unwrap()
    }

    fn collator_en_case_level(strength: Strength) -> CollatorBorrowed<'static> {
        let locale = locale!("en").into();
        let mut options = CollatorOptions::default();
        options.strength = Some(strength);
        options.case_level = Some(crate::options::CaseLevel::On);
        Collator::try_new(locale, options).unwrap()
    }

    fn keys(strength: Strength) -> (Key, Key, Key) {
        let collator = collator_en(strength);

        let mut k0 = Vec::new();
        let Ok(()) = collator.write_sort_key_to("aabc", &mut k0);
        let mut k1 = Vec::new();
        let Ok(()) = collator.write_sort_key_to("aAbc", &mut k1);
        let mut k2 = Vec::new();
        let Ok(()) = collator.write_sort_key_to("áAbc", &mut k2);

        (k0, k1, k2)
    }

    #[test]
    fn sort_key_primary() {
        let (k0, k1, k2) = keys(Strength::Primary);
        assert_eq!(k0, k1);
        assert_eq!(k1, k2);
    }

    #[test]
    fn sort_key_secondary() {
        let (k0, k1, k2) = keys(Strength::Secondary);
        assert_eq!(k0, k1);
        assert!(k1 < k2);
    }

    #[test]
    fn sort_key_tertiary() {
        let (k0, k1, k2) = keys(Strength::Tertiary);
        assert!(k0 < k1);
        assert!(k1 < k2);
    }

    fn collator_ja(strength: Strength) -> CollatorBorrowed<'static> {
        let locale = locale!("ja").into();
        let mut options = CollatorOptions::default();
        options.strength = Some(strength);
        Collator::try_new(locale, options).unwrap()
    }

    fn keys_ja_strs(strength: Strength, s0: &str, s1: &str) -> (Key, Key) {
        let collator = collator_ja(strength);

        let mut k0 = Vec::new();
        let Ok(()) = collator.write_sort_key_to(s0, &mut k0);
        let mut k1 = Vec::new();
        let Ok(()) = collator.write_sort_key_to(s1, &mut k1);

        (k0, k1)
    }

    fn keys_ja(strength: Strength) -> (Key, Key) {
        keys_ja_strs(strength, "あ", "ア")
    }

    #[test]
    fn sort_keys_ja_to_quaternary() {
        let (k0, k1) = keys_ja(Strength::Primary);
        assert_eq!(k0, k1);
        let (k0, k1) = keys_ja(Strength::Secondary);
        assert_eq!(k0, k1);
        let (k0, k1) = keys_ja(Strength::Tertiary);
        assert_eq!(k0, k1);
        let (k0, k1) = keys_ja(Strength::Quaternary);
        assert!(k0 < k1);
    }

    #[test]
    fn sort_keys_ja_identical() {
        let (k0, k1) = keys_ja_strs(Strength::Quaternary, "ア", "ア");
        assert_eq!(k0, k1);
        let (k0, k1) = keys_ja_strs(Strength::Identical, "ア", "ア");
        assert!(k0 < k1);
    }

    #[test]
    fn sort_keys_utf16() {
        let collator = collator_en(Strength::Identical);

        const STR8: &[u8] = b"hello world!";
        let mut k8 = Vec::new();
        let Ok(()) = collator.write_sort_key_utf8_to(STR8, &mut k8);

        const STR16: &[u16] = &[
            0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21,
        ];
        let mut k16 = Vec::new();
        let Ok(()) = collator.write_sort_key_utf16_to(STR16, &mut k16);
        assert_eq!(k8, k16);
    }

    #[test]
    fn sort_keys_invalid() {
        let collator = collator_en(Strength::Identical);

        // some invalid strings
        let mut k = Vec::new();
        let Ok(()) = collator.write_sort_key_utf8_to(b"\xf0\x90", &mut k);
        let mut k = Vec::new();
        let Ok(()) = collator.write_sort_key_utf16_to(&[0xdd1e], &mut k);
    }

    #[test]
    fn sort_key_to_vecdeque() {
        let collator = collator_en(Strength::Identical);

        let mut k0 = Vec::new();
        let Ok(()) = collator.write_sort_key_to("áAbc", &mut k0);
        let mut k1 = VecDeque::new();
        let Ok(()) = collator.write_sort_key_to("áAbc", &mut k1);
        assert!(k0.iter().eq(k1.iter()));
    }

    #[test]
    fn sort_key_to_slice() {
        let collator = collator_en(Strength::Identical);

        let mut k0 = Vec::new();
        let Ok(()) = collator.write_sort_key_to("áAbc", &mut k0);
        let mut k1 = [0u8; 100];
        let len = collator.write_sort_key_to("áAbc", &mut k1[..]).unwrap();
        assert_eq!(len, k0.len());
        assert!(k0.iter().eq(k1[..len].iter()));
    }

    #[test]
    fn sort_key_to_slice_no_space() {
        let collator = collator_en(Strength::Identical);
        let mut k = [0u8; 0];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        assert!(matches!(res, Err(TooSmall { .. })));
    }

    #[test]
    fn sort_key_to_slice_too_long() {
        // This runs out of space in write_sort_key_up_to_quaternary.
        let collator = collator_en(Strength::Identical);
        let mut k = [0u8; 5];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        assert!(matches!(res, Err(TooSmall { .. })));
    }

    #[test]
    fn sort_key_to_slice_identical_too_long() {
        // This runs out of space while appending UTF-8 in the SinkAdapter.
        let collator = collator_en(Strength::Identical);
        let mut k = [0u8; 22];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        assert!(matches!(res, Err(TooSmall { .. })));
    }

    #[test]
    fn sort_key_just_right() {
        // get the length needed
        let collator = collator_en(Strength::Identical);
        let mut k = [0u8; 0];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        let len = res.unwrap_err().length;

        // almost enough
        let mut k = vec![0u8; len - 1];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        let len = res.unwrap_err().length;

        // just right
        let mut k = vec![0u8; len];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        assert_eq!(res, Ok(len));
    }

    #[test]
    fn sort_key_utf16_slice_too_small() {
        let collator = collator_en(Strength::Identical);
        const STR16: &[u16] = &[0x68, 0x65, 0x6c, 0x6c, 0x6f];
        let mut k = [0u8; 4];
        let res = collator.write_sort_key_utf16_to(STR16, &mut k[..]);
        assert!(matches!(res, Err(TooSmall { .. })));
    }

    #[test]
    fn sort_key_very_long() {
        let collator = collator_en(Strength::Secondary);
        let mut k = Vec::new();
        let Ok(()) = collator.write_sort_key_to(&"a".repeat(300), &mut k);
    }

    #[test]
    fn sort_key_case_level() {
        let collator = collator_en_case_level(Strength::Tertiary);
        let mut k = Vec::new();
        let Ok(()) = collator.write_sort_key_to("aBc", &mut k);
    }

    #[test]
    fn sort_key_case_level_empty() {
        let collator = collator_en_case_level(Strength::Tertiary);
        let mut k = Vec::new();
        let Ok(()) = collator.write_sort_key_to("", &mut k);
    }

    fn check_sort_key_less(a: &[u16], b: &[u16]) {
        let collator = collator_en(Strength::Identical);
        let mut ak = Vec::new();
        let Ok(()) = collator.write_sort_key_utf16_to(a, &mut ak);
        let mut bk = Vec::new();
        let Ok(()) = collator.write_sort_key_utf16_to(b, &mut bk);
        assert!(ak < bk, "failed: {a:04x?} - {b:04x?}");
    }

    #[test]
    fn sort_key_fffe_bug_6811() {
        check_sort_key_less(
            &[0xfffe, 0x0001, 0x0002, 0x0003],
            &[0x0001, 0xfffe, 0x0002, 0x0003],
        );
        check_sort_key_less(
            &[0x0001, 0xfffe, 0x0002, 0x0003],
            &[0x0001, 0x0002, 0xfffe, 0x0003],
        );
        check_sort_key_less(
            &[0x0001, 0x0002, 0xfffe, 0x0003],
            &[0x0001, 0x0002, 0x0003, 0xfffe],
        );
        check_sort_key_less(&[0xfffe, 0x0000, 0x0000], &[0x0000, 0xfffe, 0x0000]);
        check_sort_key_less(&[0x0000, 0xfffe, 0x0000], &[0x0000, 0x0000, 0xfffe]);
    }
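
    // A small sketch illustrating the property that makes BOCSU usable here:
    // the encoded byte sequences compare in the same order as the differences
    // they encode, across all of the one- to four-byte length classes.  (The
    // diff values below are arbitrary sample points.)
    #[test]
    fn bocsu_diff_ordering() {
        let mut prev: Vec<u8> = Vec::new();
        let Ok(()) = write_diff(-0x20000, &mut prev, &mut ());
        for diff in [-0x1000, -0x100, -1, 0, 1, 0x100, 0x1000, 0x20000] {
            let mut cur = Vec::new();
            let Ok(()) = write_diff(diff, &mut cur, &mut ());
            assert!(prev < cur, "encoding order broken at diff {diff}");
            prev = cur;
        }
    }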
}