edit/
icu.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Bindings to the ICU library.
5
6use std::cmp::Ordering;
7use std::ffi::CStr;
8use std::mem;
9use std::mem::MaybeUninit;
10use std::ops::Range;
11use std::ptr::{null, null_mut};
12
13use crate::arena::{Arena, ArenaString, scratch_arena};
14use crate::buffer::TextBuffer;
15use crate::unicode::Utf8Chars;
16use crate::{apperr, arena_format, sys};
17
/// A single text encoding entry as listed by [`get_available_encodings`].
#[derive(Clone, Copy)]
pub struct Encoding {
    /// The name the encoding is listed under. [`get_available_encodings`] fills this
    /// with the MIME name if ICU reports one, and with the ICU converter name otherwise.
    pub label: &'static str,
    /// The converter name as reported by ICU. For the two built-in entries
    /// this is "UTF-8" and "UTF-8 BOM" respectively.
    pub canonical: &'static str,
}
23
/// The encoding lists returned by [`get_available_encodings`].
pub struct Encodings {
    /// The leading part of [`Encodings::all`]: the built-in UTF-8 entries,
    /// followed by the ICU encodings for which a MIME name was found.
    pub preferred: &'static [Encoding],
    /// Every known encoding: the preferred ones first, then all remaining ones.
    pub all: &'static [Encoding],
}
28
// Filled lazily by `get_available_encodings`.
// An empty `all` list means that the lists have not been computed yet.
static mut ENCODINGS: Encodings = Encodings { preferred: &[], all: &[] };
30
31/// Returns a list of encodings ICU supports.
32pub fn get_available_encodings() -> &'static Encodings {
33    // OnceCell for people that want to put it into a static.
34    #[allow(static_mut_refs)]
35    unsafe {
36        if ENCODINGS.all.is_empty() {
37            let scratch = scratch_arena(None);
38            let mut preferred = Vec::new_in(&*scratch);
39            let mut alternative = Vec::new_in(&*scratch);
40
41            // These encodings are always available.
42            preferred.push(Encoding { label: "UTF-8", canonical: "UTF-8" });
43            preferred.push(Encoding { label: "UTF-8 BOM", canonical: "UTF-8 BOM" });
44
45            if let Ok(f) = init_if_needed() {
46                let mut n = 0;
47                loop {
48                    let name = (f.ucnv_getAvailableName)(n);
49                    if name.is_null() {
50                        break;
51                    }
52
53                    n += 1;
54
55                    let name = CStr::from_ptr(name).to_str().unwrap_unchecked();
56                    // We have already pushed UTF-8 above and can skip it.
57                    // There is no need to filter UTF-8 BOM here,
58                    // since ICU does not distinguish it from UTF-8.
59                    if name.is_empty() || name == "UTF-8" {
60                        continue;
61                    }
62
63                    let mut status = icu_ffi::U_ZERO_ERROR;
64                    let mime = (f.ucnv_getStandardName)(
65                        name.as_ptr(),
66                        c"MIME".as_ptr() as *const _,
67                        &mut status,
68                    );
69                    if !mime.is_null() && status.is_success() {
70                        let mime = CStr::from_ptr(mime).to_str().unwrap_unchecked();
71                        preferred.push(Encoding { label: mime, canonical: name });
72                    } else {
73                        alternative.push(Encoding { label: name, canonical: name });
74                    }
75                }
76            }
77
78            let preferred_len = preferred.len();
79
80            // Combine the preferred and alternative encodings into a single list.
81            let mut all = Vec::with_capacity(preferred.len() + alternative.len());
82            all.extend(preferred);
83            all.extend(alternative);
84
85            let all = all.leak();
86            ENCODINGS.preferred = &all[..preferred_len];
87            ENCODINGS.all = &all[..];
88        }
89
90        &ENCODINGS
91    }
92}
93
94/// Formats the given ICU error code into a human-readable string.
95pub fn apperr_format(f: &mut std::fmt::Formatter<'_>, code: u32) -> std::fmt::Result {
96    fn format(code: u32) -> &'static str {
97        let Ok(f) = init_if_needed() else {
98            return "";
99        };
100
101        let status = icu_ffi::UErrorCode::new(code);
102        let ptr = unsafe { (f.u_errorName)(status) };
103        if ptr.is_null() {
104            return "";
105        }
106
107        let str = unsafe { CStr::from_ptr(ptr) };
108        str.to_str().unwrap_or("")
109    }
110
111    let msg = format(code);
112    if !msg.is_empty() {
113        write!(f, "ICU Error: {msg}")
114    } else {
115        write!(f, "ICU Error: {code:#08x}")
116    }
117}
118
/// Converts between two encodings using ICU.
///
/// The conversion goes through a caller-provided UTF-16 pivot buffer
/// which is handed to ICU's `ucnv_convertEx` on every [`Converter::convert`] call.
pub struct Converter<'pivot> {
    /// The ICU converter opened for the source encoding. Closed on drop.
    source: *mut icu_ffi::UConverter,
    /// The ICU converter opened for the target encoding. Closed on drop.
    target: *mut icu_ffi::UConverter,
    /// The pivot buffer that was passed to [`Converter::new`].
    pivot_buffer: &'pivot mut [MaybeUninit<u16>],
    /// Cursor into `pivot_buffer`, passed to ICU as `pivot_source` and updated by it.
    pivot_source: *mut u16,
    /// Cursor into `pivot_buffer`, passed to ICU as `pivot_target` and updated by it.
    pivot_target: *mut u16,
    /// Passed to ICU as the `reset` argument.
    /// `true` until the first [`Converter::convert`] call, `false` afterwards.
    reset: bool,
}
128
129impl Drop for Converter<'_> {
130    fn drop(&mut self) {
131        let f = assume_loaded();
132        unsafe { (f.ucnv_close)(self.source) };
133        unsafe { (f.ucnv_close)(self.target) };
134    }
135}
136
137impl<'pivot> Converter<'pivot> {
138    /// Constructs a new `Converter` instance.
139    ///
140    /// # Parameters
141    ///
142    /// * `pivot_buffer`: A buffer used to cache partial conversions.
143    ///   Don't make it too small.
144    /// * `source_encoding`: The source encoding name (e.g., "UTF-8").
145    /// * `target_encoding`: The target encoding name (e.g., "UTF-16").
146    pub fn new(
147        pivot_buffer: &'pivot mut [MaybeUninit<u16>],
148        source_encoding: &str,
149        target_encoding: &str,
150    ) -> apperr::Result<Self> {
151        let f = init_if_needed()?;
152
153        let arena = scratch_arena(None);
154        let source_encoding = Self::append_nul(&arena, source_encoding);
155        let target_encoding = Self::append_nul(&arena, target_encoding);
156
157        let mut status = icu_ffi::U_ZERO_ERROR;
158        let source = unsafe { (f.ucnv_open)(source_encoding.as_ptr(), &mut status) };
159        let target = unsafe { (f.ucnv_open)(target_encoding.as_ptr(), &mut status) };
160        if status.is_failure() {
161            if !source.is_null() {
162                unsafe { (f.ucnv_close)(source) };
163            }
164            if !target.is_null() {
165                unsafe { (f.ucnv_close)(target) };
166            }
167            return Err(status.as_error());
168        }
169
170        let pivot_source = pivot_buffer.as_mut_ptr() as *mut u16;
171        let pivot_target = unsafe { pivot_source.add(pivot_buffer.len()) };
172
173        Ok(Self { source, target, pivot_buffer, pivot_source, pivot_target, reset: true })
174    }
175
176    fn append_nul<'a>(arena: &'a Arena, input: &str) -> ArenaString<'a> {
177        arena_format!(arena, "{}\0", input)
178    }
179
180    /// Performs one step of the encoding conversion.
181    ///
182    /// # Parameters
183    ///
184    /// * `input`: The input buffer to convert from.
185    ///   It should be in the `source_encoding` that was previously specified.
186    /// * `output`: The output buffer to convert to.
187    ///   It should be in the `target_encoding` that was previously specified.
188    ///
189    /// # Returns
190    ///
191    /// A tuple containing:
192    /// 1. The number of bytes read from the input buffer.
193    /// 2. The number of bytes written to the output buffer.
194    pub fn convert(
195        &mut self,
196        input: &[u8],
197        output: &mut [MaybeUninit<u8>],
198    ) -> apperr::Result<(usize, usize)> {
199        let f = assume_loaded();
200
201        let input_beg = input.as_ptr();
202        let input_end = unsafe { input_beg.add(input.len()) };
203        let mut input_ptr = input_beg;
204
205        let output_beg = output.as_mut_ptr() as *mut u8;
206        let output_end = unsafe { output_beg.add(output.len()) };
207        let mut output_ptr = output_beg;
208
209        let pivot_beg = self.pivot_buffer.as_mut_ptr() as *mut u16;
210        let pivot_end = unsafe { pivot_beg.add(self.pivot_buffer.len()) };
211
212        let flush = input.is_empty();
213        let mut status = icu_ffi::U_ZERO_ERROR;
214
215        unsafe {
216            (f.ucnv_convertEx)(
217                /* target_cnv   */ self.target,
218                /* source_cnv   */ self.source,
219                /* target       */ &mut output_ptr,
220                /* target_limit */ output_end,
221                /* source       */ &mut input_ptr,
222                /* source_limit */ input_end,
223                /* pivot_start  */ pivot_beg,
224                /* pivot_source */ &mut self.pivot_source,
225                /* pivot_target */ &mut self.pivot_target,
226                /* pivot_limit  */ pivot_end,
227                /* reset        */ self.reset,
228                /* flush        */ flush,
229                /* status       */ &mut status,
230            );
231        }
232
233        self.reset = false;
234        if status.is_failure() && status != icu_ffi::U_BUFFER_OVERFLOW_ERROR {
235            return Err(status.as_error());
236        }
237
238        let input_advance = unsafe { input_ptr.offset_from(input_beg) as usize };
239        let output_advance = unsafe { output_ptr.offset_from(output_beg) as usize };
240        Ok((input_advance, output_advance))
241    }
242}
243
// The number of `u16` slots in each of the arrays of a `Cache`.
//
// In benchmarking, I found that the performance does not really change much by changing this value.
// I picked 64 because it seemed like a reasonable lower bound.
const CACHE_SIZE: usize = 64;
247
/// Caches a chunk of TextBuffer contents (UTF-8) in UTF-16 format.
#[repr(C)]
struct Cache {
    /// The translated text. Contains [`Cache::utf16_len`]-many valid items.
    utf16: [u16; CACHE_SIZE],
    /// For each code unit in [`Cache::utf16`] this stores the offset in the [`TextBuffer`],
    /// relative to the start of [`Cache::utf8_range`].
    /// Contains [`Cache::utf16_len`]-many valid items, plus one
    /// past-the-end entry that holds the UTF-8 length of the chunk.
    utf16_to_utf8_offsets: [u16; CACHE_SIZE],
    /// `utf8_to_utf16_offsets[native_offset - utf8_range.start]` will tell you which item in
    /// [`Cache::utf16`] maps to the given `native_offset` in the underlying [`TextBuffer`].
    /// Contains `utf8_range.len()`-many valid items, plus one
    /// past-the-end entry that holds [`Cache::utf16_len`].
    utf8_to_utf16_offsets: [u16; CACHE_SIZE],

    /// The number of valid items in [`Cache::utf16`].
    utf16_len: usize,
    /// The length of the leading ASCII-only part of the chunk,
    /// which equals the offset of the first non-ASCII character if there is one.
    /// Less than or equal to [`Cache::utf16_len`].
    native_indexing_limit: usize,

    /// The range of UTF-8 text in the [`TextBuffer`] that this chunk covers.
    utf8_range: Range<usize>,
}
271
/// The per-`UText` state that lives in `UText::p_extra`.
///
/// It holds two [`Cache`] chunks. When neither of them contains a requested
/// index, the one that was not used most recently gets overwritten.
#[repr(C)]
struct DoubleCache {
    /// The two cached chunks.
    cache: [Cache; 2],
    /// You can consider this a 1 bit index into `cache`.
    /// It points at the most recently used chunk.
    mru: bool,
}
278
/// A wrapper around ICU's `UText` struct.
///
/// In our case its only purpose is to adapt a [`TextBuffer`] for ICU.
/// The wrapped `UText` is closed when the `Text` is dropped.
///
/// # Safety
///
/// Warning! No lifetime tracking is done here.
/// I initially did it properly with a PhantomData marker for the TextBuffer
/// lifetime, but it was a pain so now I don't. Not a big deal in our case.
/// The `'static` lifetime of the reference is therefore not a real guarantee.
pub struct Text(&'static mut icu_ffi::UText);
289
290impl Drop for Text {
291    fn drop(&mut self) {
292        let f = assume_loaded();
293        unsafe { (f.utext_close)(self.0) };
294    }
295}
296
297impl Text {
298    /// Constructs an ICU `UText` instance from a [`TextBuffer`].
299    ///
300    /// # Safety
301    ///
302    /// The caller must ensure that the given [`TextBuffer`]
303    /// outlives the returned `Text` instance.
304    pub unsafe fn new(tb: &TextBuffer) -> apperr::Result<Self> {
305        let f = init_if_needed()?;
306
307        let mut status = icu_ffi::U_ZERO_ERROR;
308        let ptr =
309            unsafe { (f.utext_setup)(null_mut(), size_of::<DoubleCache>() as i32, &mut status) };
310        if status.is_failure() {
311            return Err(status.as_error());
312        }
313
314        const FUNCS: icu_ffi::UTextFuncs = icu_ffi::UTextFuncs {
315            table_size: size_of::<icu_ffi::UTextFuncs>() as i32,
316            reserved1: 0,
317            reserved2: 0,
318            reserved3: 0,
319            clone: Some(utext_clone),
320            native_length: Some(utext_native_length),
321            access: Some(utext_access),
322            extract: None,
323            replace: None,
324            copy: None,
325            map_offset_to_native: Some(utext_map_offset_to_native),
326            map_native_index_to_utf16: Some(utext_map_native_index_to_utf16),
327            close: None,
328            spare1: None,
329            spare2: None,
330            spare3: None,
331        };
332
333        let ut = unsafe { &mut *ptr };
334        ut.p_funcs = &FUNCS;
335        ut.context = tb as *const TextBuffer as *mut _;
336        ut.a = -1;
337
338        Ok(Self(ut))
339    }
340}
341
342fn text_buffer_from_utext<'a>(ut: &icu_ffi::UText) -> &'a TextBuffer {
343    unsafe { &*(ut.context as *const TextBuffer) }
344}
345
346fn double_cache_from_utext<'a>(ut: &icu_ffi::UText) -> &'a mut DoubleCache {
347    unsafe { &mut *(ut.p_extra as *mut DoubleCache) }
348}
349
350extern "C" fn utext_clone(
351    dest: *mut icu_ffi::UText,
352    src: &icu_ffi::UText,
353    deep: bool,
354    status: &mut icu_ffi::UErrorCode,
355) -> *mut icu_ffi::UText {
356    if status.is_failure() {
357        return null_mut();
358    }
359
360    if deep {
361        *status = icu_ffi::U_UNSUPPORTED_ERROR;
362        return null_mut();
363    }
364
365    let f = assume_loaded();
366    let ut_ptr = unsafe { (f.utext_setup)(dest, size_of::<DoubleCache>() as i32, status) };
367    if status.is_failure() {
368        return null_mut();
369    }
370
371    // TODO: I'm somewhat unsure whether we have to preserve the `chunk_offset`.
372    // We can't blindly copy chunk contents and the `Cache` in `ut.p_extra`,
373    // because they may contain dirty contents (different `TextBuffer` generation).
374    unsafe {
375        let ut = &mut *ut_ptr;
376        ut.p_funcs = src.p_funcs;
377        ut.context = src.context;
378        ut.a = -1;
379    }
380
381    ut_ptr
382}
383
384extern "C" fn utext_native_length(ut: &mut icu_ffi::UText) -> i64 {
385    let tb = text_buffer_from_utext(ut);
386    tb.text_length() as i64
387}
388
389extern "C" fn utext_access(ut: &mut icu_ffi::UText, native_index: i64, forward: bool) -> bool {
390    if let Some(cache) = utext_access_impl(ut, native_index, forward) {
391        let native_off = native_index as usize - cache.utf8_range.start;
392        ut.chunk_contents = cache.utf16.as_ptr();
393        ut.chunk_length = cache.utf16_len as i32;
394        ut.chunk_offset = cache.utf8_to_utf16_offsets[native_off] as i32;
395        ut.chunk_native_start = cache.utf8_range.start as i64;
396        ut.chunk_native_limit = cache.utf8_range.end as i64;
397        ut.native_indexing_limit = cache.native_indexing_limit as i32;
398        true
399    } else {
400        false
401    }
402}
403
/// Finds or builds the [`Cache`] chunk for the given `native_index`.
///
/// With `forward == true` the chunk must contain the byte at `native_index`,
/// otherwise it must contain the byte right before it. Returns `None` if that
/// byte lies outside of the [`TextBuffer`].
///
/// If the `TextBuffer` generation differs from the one stored in `ut.a`, both
/// chunks are invalidated first. Otherwise a chunk that already contains the byte
/// is reused. If none does, the least recently used chunk is filled with
/// the UTF-16 translation of the text around `native_index`.
fn utext_access_impl<'a>(
    ut: &mut icu_ffi::UText,
    native_index: i64,
    forward: bool,
) -> Option<&'a mut Cache> {
    let tb = text_buffer_from_utext(ut);
    // The index of the byte that the returned chunk has to contain.
    let mut index_contained = native_index;

    if !forward {
        index_contained -= 1;
    }
    if index_contained < 0 || index_contained as usize >= tb.text_length() {
        return None;
    }

    let index_contained = index_contained as usize;
    let native_index = native_index as usize;
    let double_cache = double_cache_from_utext(ut);
    // `ut.a` holds the `TextBuffer` generation that the caches were built from.
    let dirty = ut.a != tb.generation() as i64;

    if dirty {
        // The text buffer contents have changed.
        // Invalidate both caches so that future calls don't mistakenly use them
        // when they enter the for loop in the else branch below (`dirty == false`).
        double_cache.cache[0].utf16_len = 0;
        double_cache.cache[1].utf16_len = 0;
        double_cache.cache[0].utf8_range = 0..0;
        double_cache.cache[1].utf8_range = 0..0;
        ut.a = tb.generation() as i64;
    } else {
        // Check if one of the caches already contains the requested range.
        for (i, cache) in double_cache.cache.iter_mut().enumerate() {
            if cache.utf8_range.contains(&index_contained) {
                double_cache.mru = i != 0;
                return Some(cache);
            }
        }
    }

    // Turn the least recently used cache into the most recently used one.
    let double_cache = double_cache_from_utext(ut);
    double_cache.mru = !double_cache.mru;
    let cache = &mut double_cache.cache[double_cache.mru as usize];

    // In order to safely fit any UTF-8 character into our cache,
    // we must assume the worst case of a 4-byte long encoding.
    const UTF16_LEN_LIMIT: usize = CACHE_SIZE - 4;
    // The number of UTF-8 bytes we aim to translate.
    let utf8_len_limit;
    // The offset in the `TextBuffer` at which the chunk starts.
    let native_start;

    if forward {
        utf8_len_limit = (tb.text_length() - native_index).min(UTF16_LEN_LIMIT);
        native_start = native_index;
    } else {
        // The worst case ratio for UTF-8 to UTF-16 is 1:1, when the text is ASCII.
        // This allows us to safely subtract the UTF-16 buffer size
        // and assume that whatever we read as UTF-8 will fit.
        // TODO: Test what happens if you have lots of invalid UTF-8 text blow up to U+FFFD.
        utf8_len_limit = native_index.min(UTF16_LEN_LIMIT);

        // Since simply subtracting an offset may end up in the middle of a codepoint sequence,
        // we must align the offset to the next codepoint boundary.
        // Here we skip trail bytes until we find a lead.
        let mut beg = native_index - utf8_len_limit;
        let chunk = tb.read_forward(beg);
        for &c in chunk {
            if c & 0b1100_0000 != 0b1000_0000 {
                break;
            }
            beg += 1;
        }

        native_start = beg;
    }

    // Translate the given range from UTF-8 to UTF-16.
    // NOTE: This code makes the assumption that the `native_index` is always
    // at UTF-8 codepoint boundaries which technically isn't guaranteed.
    let mut utf16_len = 0;
    let mut utf8_len = 0;
    // The length of the leading ASCII-only part of the chunk.
    let mut ascii_len = 0;
    'outer: loop {
        let initial_utf8_len = utf8_len;
        let chunk = tb.read_forward(native_start + utf8_len);
        if chunk.is_empty() {
            break;
        }

        let mut it = Utf8Chars::new(chunk, 0);

        // If we've only seen ASCII so far we can fast-pass the UTF-16 translation,
        // because we can just widen from u8 -> u16.
        if utf16_len == ascii_len {
            let haystack = &chunk[..chunk.len().min(utf8_len_limit - ascii_len)];

            // When it comes to performance, and the search space is small (which it is here),
            // it's always a good idea to keep the loops small and tight...
            let len = haystack.iter().position(|&c| c >= 0x80).unwrap_or(haystack.len());

            // ...In this case it allows the compiler to vectorize this loop and double
            // the performance. Luckily, llvm doesn't unroll the loop, which is great,
            // because `len` will always be a relatively small number.
            for &c in &chunk[..len] {
                unsafe {
                    *cache.utf16.get_unchecked_mut(ascii_len) = c as u16;
                    *cache.utf16_to_utf8_offsets.get_unchecked_mut(ascii_len) = ascii_len as u16;
                    *cache.utf8_to_utf16_offsets.get_unchecked_mut(ascii_len) = ascii_len as u16;
                }
                ascii_len += 1;
            }

            utf16_len += len;
            utf8_len += len;
            it.seek(len);
            if ascii_len >= UTF16_LEN_LIMIT {
                break;
            }
        }

        // The slow path: decode one codepoint at a time.
        loop {
            let Some(c) = it.next() else {
                break;
            };

            // Thanks to our `if utf16_len >= UTF16_LEN_LIMIT` check,
            // we can safely assume that this will fit.
            unsafe {
                let utf8_len_beg = utf8_len;
                let utf8_len_end = initial_utf8_len + it.offset();

                // Every byte of the codepoint maps to the codepoint's first UTF-16 unit.
                while utf8_len < utf8_len_end {
                    *cache.utf8_to_utf16_offsets.get_unchecked_mut(utf8_len) = utf16_len as u16;
                    utf8_len += 1;
                }

                if c <= '\u{FFFF}' {
                    *cache.utf16.get_unchecked_mut(utf16_len) = c as u16;
                    *cache.utf16_to_utf8_offsets.get_unchecked_mut(utf16_len) = utf8_len_beg as u16;
                    utf16_len += 1;
                } else {
                    // Encode as a surrogate pair. Both halves map to the start of the codepoint.
                    let c = c as u32 - 0x10000;
                    let b = utf8_len_beg as u16;
                    *cache.utf16.get_unchecked_mut(utf16_len) = (c >> 10) as u16 | 0xD800;
                    *cache.utf16.get_unchecked_mut(utf16_len + 1) = (c & 0x3FF) as u16 | 0xDC00;
                    *cache.utf16_to_utf8_offsets.get_unchecked_mut(utf16_len) = b;
                    *cache.utf16_to_utf8_offsets.get_unchecked_mut(utf16_len + 1) = b;
                    utf16_len += 2;
                }
            }

            if utf16_len >= UTF16_LEN_LIMIT || utf8_len >= utf8_len_limit {
                break 'outer;
            }
        }
    }

    // Allow for looking up past-the-end indices via
    // `utext_map_offset_to_native` and `utext_map_native_index_to_utf16`.
    cache.utf16_to_utf8_offsets[utf16_len] = utf8_len as u16;
    cache.utf8_to_utf16_offsets[utf8_len] = utf16_len as u16;

    let native_limit = native_start + utf8_len;
    cache.utf16_len = utf16_len;
    // If parts of the UTF-8 chunk are ASCII, we can tell ICU that it doesn't need to call
    // utext_map_offset_to_native. For some reason, uregex calls that function *a lot*,
    // literally half the CPU time is spent on it.
    cache.native_indexing_limit = ascii_len;
    cache.utf8_range = native_start..native_limit;
    Some(cache)
}
574
575extern "C" fn utext_map_offset_to_native(ut: &icu_ffi::UText) -> i64 {
576    debug_assert!((0..=ut.chunk_length).contains(&ut.chunk_offset));
577
578    let double_cache = double_cache_from_utext(ut);
579    let cache = &double_cache.cache[double_cache.mru as usize];
580    let off_rel = cache.utf16_to_utf8_offsets[ut.chunk_offset as usize];
581    let off_abs = cache.utf8_range.start + off_rel as usize;
582    off_abs as i64
583}
584
585extern "C" fn utext_map_native_index_to_utf16(ut: &icu_ffi::UText, native_index: i64) -> i32 {
586    debug_assert!((ut.chunk_native_start..=ut.chunk_native_limit).contains(&native_index));
587
588    let double_cache = double_cache_from_utext(ut);
589    let cache = &double_cache.cache[double_cache.mru as usize];
590    let off_rel = cache.utf8_to_utf16_offsets[(native_index - ut.chunk_native_start) as usize];
591    off_rel as i32
592}
593
/// A wrapper around ICU's `URegularExpression` struct.
///
/// The wrapped regex is closed when the `Regex` is dropped.
/// Matches are retrieved through the [`Iterator`] implementation.
///
/// # Safety
///
/// Warning! No lifetime tracking is done here.
/// The `'static` lifetime of the reference is therefore not a real guarantee.
pub struct Regex(&'static mut icu_ffi::URegularExpression);
600
601impl Drop for Regex {
602    fn drop(&mut self) {
603        let f = assume_loaded();
604        unsafe { (f.uregex_close)(self.0) };
605    }
606}
607
608impl Regex {
609    /// Enable case-insensitive matching.
610    pub const CASE_INSENSITIVE: i32 = icu_ffi::UREGEX_CASE_INSENSITIVE;
611
612    /// If set, ^ and $ match the start and end of each line.
613    /// Otherwise, they match the start and end of the entire string.
614    pub const MULTILINE: i32 = icu_ffi::UREGEX_MULTILINE;
615
616    /// Treat the given pattern as a literal string.
617    pub const LITERAL: i32 = icu_ffi::UREGEX_LITERAL;
618
619    /// Constructs a regex, plain and simple. Read `uregex_open` docs.
620    ///
621    /// # Safety
622    ///
623    /// The caller must ensure that the given `Text` outlives the returned `Regex` instance.
624    pub unsafe fn new(pattern: &str, flags: i32, text: &Text) -> apperr::Result<Self> {
625        let f = init_if_needed()?;
626        unsafe {
627            let scratch = scratch_arena(None);
628            let mut utf16 = Vec::new_in(&*scratch);
629            let mut status = icu_ffi::U_ZERO_ERROR;
630
631            utf16.extend(pattern.encode_utf16());
632
633            let ptr = (f.uregex_open)(
634                utf16.as_ptr(),
635                utf16.len() as i32,
636                icu_ffi::UREGEX_MULTILINE | icu_ffi::UREGEX_ERROR_ON_UNKNOWN_ESCAPES | flags,
637                None,
638                &mut status,
639            );
640            // ICU describes the time unit as being dependent on CPU performance
641            // and "typically [in] the order of milliseconds", but this claim seems
642            // highly outdated. On my CPU from 2021, a limit of 4096 equals roughly 600ms.
643            (f.uregex_setTimeLimit)(ptr, 4096, &mut status);
644            (f.uregex_setUText)(ptr, text.0 as *const _ as *mut _, &mut status);
645            if status.is_failure() {
646                return Err(status.as_error());
647            }
648
649            Ok(Self(&mut *ptr))
650        }
651    }
652
653    /// Updates the regex pattern with the given text.
654    /// If the text contents have changed, you can pass the same text as you used
655    /// initially and it'll trigger ICU to reload the text and invalidate its caches.
656    ///
657    /// # Safety
658    ///
659    /// The caller must ensure that the given `Text` outlives the `Regex` instance.
660    pub unsafe fn set_text(&mut self, text: &mut Text, offset: usize) {
661        // Get `utext_access_impl` to detect the `TextBuffer::generation` change,
662        // and refresh its contents. This ensures that ICU doesn't reuse
663        // stale `UText::chunk_contents`, as it has no way tell that it's stale.
664        utext_access(text.0, offset as i64, true);
665
666        let f = assume_loaded();
667        let mut status = icu_ffi::U_ZERO_ERROR;
668        unsafe { (f.uregex_setUText)(self.0, text.0 as *const _ as *mut _, &mut status) };
669        // `uregex_setUText` resets the regex to the start of the text.
670        // Because of this, we must also call `uregex_reset64`.
671        unsafe { (f.uregex_reset64)(self.0, offset as i64, &mut status) };
672    }
673
674    /// Sets the regex to the absolute offset in the underlying text.
675    pub fn reset(&mut self, offset: usize) {
676        let f = assume_loaded();
677        let mut status = icu_ffi::U_ZERO_ERROR;
678        unsafe { (f.uregex_reset64)(self.0, offset as i64, &mut status) };
679    }
680}
681
682impl Iterator for Regex {
683    type Item = Range<usize>;
684
685    fn next(&mut self) -> Option<Self::Item> {
686        let f = assume_loaded();
687
688        let mut status = icu_ffi::U_ZERO_ERROR;
689        let ok = unsafe { (f.uregex_findNext)(self.0, &mut status) };
690        if !ok {
691            return None;
692        }
693
694        let start = unsafe { (f.uregex_start64)(self.0, 0, &mut status) };
695        let end = unsafe { (f.uregex_end64)(self.0, 0, &mut status) };
696        if status.is_failure() {
697            return None;
698        }
699
700        let start = start.max(0);
701        let end = end.max(start);
702        Some(start as usize..end as usize)
703    }
704}
705
// Lazily initialized by `compare_strings`.
// `None` means "not initialized yet", while `Some(null)` means that no collator is available.
static mut ROOT_COLLATOR: Option<*mut icu_ffi::UCollator> = None;
707
708/// Compares two UTF-8 strings for sorting using ICU's collation algorithm.
709pub fn compare_strings(a: &[u8], b: &[u8]) -> Ordering {
710    #[cold]
711    fn init() {
712        unsafe {
713            let mut coll = null_mut();
714
715            if let Ok(f) = init_if_needed() {
716                let mut status = icu_ffi::U_ZERO_ERROR;
717                coll = (f.ucol_open)(c"".as_ptr(), &mut status);
718            }
719
720            ROOT_COLLATOR = Some(coll);
721        }
722    }
723
724    // OnceCell for people that want to put it into a static.
725    #[allow(static_mut_refs)]
726    let coll = unsafe {
727        if ROOT_COLLATOR.is_none() {
728            init();
729        }
730        ROOT_COLLATOR.unwrap_unchecked()
731    };
732
733    if coll.is_null() {
734        compare_strings_ascii(a, b)
735    } else {
736        let f = assume_loaded();
737        let mut status = icu_ffi::U_ZERO_ERROR;
738        let res = unsafe {
739            (f.ucol_strcollUTF8)(
740                coll,
741                a.as_ptr(),
742                a.len() as i32,
743                b.as_ptr(),
744                b.len() as i32,
745                &mut status,
746            )
747        };
748
749        match res {
750            icu_ffi::UCollationResult::UCOL_EQUAL => Ordering::Equal,
751            icu_ffi::UCollationResult::UCOL_GREATER => Ordering::Greater,
752            icu_ffi::UCollationResult::UCOL_LESS => Ordering::Less,
753        }
754    }
755}
756
/// Unicode collation via `ucol_strcollUTF8`, now for ASCII!
///
/// The strings are compared in two steps:
/// 1. High weight: The strings are compared while ignoring ASCII case.
///    If one string is a prefix of the other, the shorter string wins.
/// 2. Low weight: If the strings only differ in case, the raw bytes decide,
///    which means that uppercase letters sort before lowercase ones.
///
/// Applying the high weight to the entire string before ever looking at the
/// low weight is what makes this a total order, and thus safe to use for sorting.
fn compare_strings_ascii(a: &[u8], b: &[u8]) -> Ordering {
    // High weight: Find the first character which differs case-insensitively.
    // `Iterator::cmp` compares lexicographically and treats a prefix as smaller.
    let lower_a = a.iter().map(u8::to_ascii_lowercase);
    let lower_b = b.iter().map(u8::to_ascii_lowercase);

    // Low weight: Find the first character which differs at all. Since the
    // lengths are equal at this point, that's just a comparison of the raw bytes.
    lower_a.cmp(lower_b).then_with(|| a.cmp(b))
}
792
// Lazily initialized by `fold_case`.
// `None` means "not initialized yet", while `Some(null)` means that no case map is available.
static mut ROOT_CASEMAP: Option<*mut icu_ffi::UCaseMap> = None;
794
795/// Converts the given UTF-8 string to lower case.
796///
797/// Case folding differs from lower case in that the output is primarily useful
798/// to machines for comparisons. It's like applying Unicode normalization.
799pub fn fold_case<'a>(arena: &'a Arena, input: &str) -> ArenaString<'a> {
800    // OnceCell for people that want to put it into a static.
801    #[allow(static_mut_refs)]
802    let casemap = unsafe {
803        if ROOT_CASEMAP.is_none() {
804            ROOT_CASEMAP = Some(if let Ok(f) = init_if_needed() {
805                let mut status = icu_ffi::U_ZERO_ERROR;
806                (f.ucasemap_open)(null(), 0, &mut status)
807            } else {
808                null_mut()
809            })
810        }
811        ROOT_CASEMAP.unwrap_unchecked()
812    };
813
814    if !casemap.is_null() {
815        let f = assume_loaded();
816        let mut status = icu_ffi::U_ZERO_ERROR;
817        let mut output = Vec::new_in(arena);
818        let mut output_len;
819
820        // First, guess the output length:
821        // TODO: What's a good heuristic here?
822        {
823            output.reserve_exact(input.len() + 16);
824            let output = output.spare_capacity_mut();
825            output_len = unsafe {
826                (f.ucasemap_utf8FoldCase)(
827                    casemap,
828                    output.as_mut_ptr() as *mut _,
829                    output.len() as i32,
830                    input.as_ptr() as *const _,
831                    input.len() as i32,
832                    &mut status,
833                )
834            };
835        }
836
837        // If that failed to fit, retry with the correct length.
838        if status == icu_ffi::U_BUFFER_OVERFLOW_ERROR && output_len > 0 {
839            output.reserve_exact(output_len as usize);
840            let output = output.spare_capacity_mut();
841            output_len = unsafe {
842                (f.ucasemap_utf8FoldCase)(
843                    casemap,
844                    output.as_mut_ptr() as *mut _,
845                    output.len() as i32,
846                    input.as_ptr() as *const _,
847                    input.len() as i32,
848                    &mut status,
849                )
850            };
851        }
852
853        if status.is_success() && output_len > 0 {
854            unsafe {
855                output.set_len(output_len as usize);
856            }
857            return unsafe { ArenaString::from_utf8_unchecked(output) };
858        }
859    }
860
861    let mut result = ArenaString::from_str(arena, input);
862    for b in unsafe { result.as_bytes_mut() } {
863        b.make_ascii_lowercase();
864    }
865    result
866}
867
// NOTE:
// To keep this neat, fields are ordered by prefix (= `ucol_` before `uregex_`),
// followed by functions in this order:
// * Static methods (e.g. `ucnv_getAvailableName`)
// * Constructors (e.g. `ucnv_open`)
// * Destructors (e.g. `ucnv_close`)
// * Methods, grouped by relationship
//   (e.g. `uregex_start64` and `uregex_end64` are near each other)
//
// WARNING:
// The order of the fields MUST match the order of strings in the following two arrays.
/// Table of dynamically resolved ICU entry points.
///
/// `init_if_needed` fills this struct as if it were a flat array of function pointers:
/// one pointer per name in `LIBICUUC_PROC_NAMES`, followed by one per name in
/// `LIBICUI18N_PROC_NAMES`. That is why it is `#[repr(C)]` and why every field
/// must be a function pointer.
#[allow(non_snake_case)]
#[repr(C)]
struct LibraryFunctions {
    // LIBICUUC_PROC_NAMES
    u_errorName: icu_ffi::u_errorName,
    ucasemap_open: icu_ffi::ucasemap_open,
    ucasemap_utf8FoldCase: icu_ffi::ucasemap_utf8FoldCase,
    ucnv_getAvailableName: icu_ffi::ucnv_getAvailableName,
    ucnv_getStandardName: icu_ffi::ucnv_getStandardName,
    ucnv_open: icu_ffi::ucnv_open,
    ucnv_close: icu_ffi::ucnv_close,
    ucnv_convertEx: icu_ffi::ucnv_convertEx,
    utext_setup: icu_ffi::utext_setup,
    utext_close: icu_ffi::utext_close,

    // LIBICUI18N_PROC_NAMES
    ucol_open: icu_ffi::ucol_open,
    ucol_strcollUTF8: icu_ffi::ucol_strcollUTF8,
    uregex_open: icu_ffi::uregex_open,
    uregex_close: icu_ffi::uregex_close,
    uregex_setTimeLimit: icu_ffi::uregex_setTimeLimit,
    uregex_setUText: icu_ffi::uregex_setUText,
    uregex_reset64: icu_ffi::uregex_reset64,
    uregex_findNext: icu_ffi::uregex_findNext,
    uregex_start64: icu_ffi::uregex_start64,
    uregex_end64: icu_ffi::uregex_end64,
}
906
// Found in libicuuc.so on UNIX, icuuc.dll/icu.dll on Windows.
//
// The entries must be listed in the same order as the first fields of `LibraryFunctions`,
// because `init_if_needed` writes the resolved pointers into that struct sequentially.
const LIBICUUC_PROC_NAMES: [&CStr; 10] = [
    c"u_errorName",
    c"ucasemap_open",
    c"ucasemap_utf8FoldCase",
    c"ucnv_getAvailableName",
    c"ucnv_getStandardName",
    c"ucnv_open",
    c"ucnv_close",
    c"ucnv_convertEx",
    c"utext_setup",
    c"utext_close",
];
920
// Found in libicui18n.so on UNIX, icuin.dll/icu.dll on Windows.
//
// The entries must be listed in the same order as the trailing fields of `LibraryFunctions`,
// because `init_if_needed` writes the resolved pointers into that struct sequentially,
// right after the ones from `LIBICUUC_PROC_NAMES`.
const LIBICUI18N_PROC_NAMES: [&CStr; 10] = [
    c"ucol_open",
    c"ucol_strcollUTF8",
    c"uregex_open",
    c"uregex_close",
    c"uregex_setTimeLimit",
    c"uregex_setUText",
    c"uregex_reset64",
    c"uregex_findNext",
    c"uregex_start64",
    c"uregex_end64",
];
934
/// Tracks whether the ICU function table has been loaded.
enum LibraryFunctionsState {
    /// No load attempt has been made yet.
    Uninitialized,
    /// A load attempt was made, but a library or one of its functions could not be resolved.
    Failed,
    /// Every function was resolved and can be called through the contained table.
    Loaded(LibraryFunctions),
}
940
/// Global load state of the ICU function table.
///
/// Written by `init_if_needed` on its first call and only read afterwards.
static mut LIBRARY_FUNCTIONS: LibraryFunctionsState = LibraryFunctionsState::Uninitialized;
942
943pub fn init() -> apperr::Result<()> {
944    init_if_needed()?;
945    Ok(())
946}
947
/// Returns the table of ICU functions, loading the libraries on the first call.
///
/// Loading is attempted at most once: `load` marks the state as `Failed` before doing
/// anything else and only upgrades it to `Loaded` once every function was resolved.
/// If the state is not `Loaded`, `apperr::APP_ICU_MISSING` is returned.
#[allow(static_mut_refs)]
fn init_if_needed() -> apperr::Result<&'static LibraryFunctions> {
    // Performs the one-time load. Kept out of line since it runs at most once.
    #[cold]
    fn load() {
        unsafe {
            // Assume failure up front, so that every early return below leaves us in
            // the `Failed` state and the load is never attempted again.
            LIBRARY_FUNCTIONS = LibraryFunctionsState::Failed;

            let Ok(libicuuc) = sys::load_libicuuc() else {
                return;
            };
            let Ok(libicui18n) = sys::load_libicui18n() else {
                return;
            };

            type TransparentFunction = unsafe extern "C" fn() -> *const ();

            // OH NO I'M DOING A BAD THING
            //
            // If this assertion hits, you either forgot to update `LIBICUUC_PROC_NAMES` or
            // `LIBICUI18N_PROC_NAMES`, or you're on a platform where `dlsym` behaves differently
            // from classic UNIX and Windows.
            //
            // This code assumes that we can treat the `LibraryFunctions` struct containing various different function
            // pointers as an array of `TransparentFunction` pointers. In C, this works on any platform that supports
            // POSIX `dlsym` or equivalent, but I suspect Rust is once again being extra about it. In any case, that's
            // still better than loading every function one by one, just to blow up our binary size for no reason.
            const _: () = assert!(
                mem::size_of::<LibraryFunctions>()
                    == mem::size_of::<TransparentFunction>()
                        * (LIBICUUC_PROC_NAMES.len() + LIBICUI18N_PROC_NAMES.len())
            );

            // The table is filled in place, one pointer-sized slot at a time.
            let mut funcs = MaybeUninit::<LibraryFunctions>::uninit();
            let mut ptr = funcs.as_mut_ptr() as *mut TransparentFunction;

            // NOTE(review): presumably the suffix that versioned ICU builds append to their
            // exported symbols on UNIX - confirm against `sys::icu_proc_suffix`.
            #[cfg(unix)]
            let scratch_outer = scratch_arena(None);
            #[cfg(unix)]
            let suffix = sys::icu_proc_suffix(&scratch_outer, libicuuc);

            // Resolve the names in exactly the order in which the fields are declared.
            for (handle, names) in
                [(libicuuc, &LIBICUUC_PROC_NAMES[..]), (libicui18n, &LIBICUI18N_PROC_NAMES[..])]
            {
                for name in names {
                    #[cfg(unix)]
                    let scratch = scratch_arena(Some(&scratch_outer));
                    #[cfg(unix)]
                    let name = &sys::add_icu_proc_suffix(&scratch, name, &suffix);

                    // A single missing function fails the entire load.
                    let Ok(func) = sys::get_proc_address(handle, name) else {
                        debug_assert!(
                            false,
                            "Failed to load ICU function: {}",
                            name.to_string_lossy()
                        );
                        return;
                    };

                    ptr.write(func);
                    ptr = ptr.add(1);
                }
            }

            // Every slot has been written at this point (see the size assertion above).
            LIBRARY_FUNCTIONS = LibraryFunctionsState::Loaded(funcs.assume_init());
        }
    }

    unsafe {
        if matches!(&LIBRARY_FUNCTIONS, LibraryFunctionsState::Uninitialized) {
            load();
        }
    }

    match unsafe { &LIBRARY_FUNCTIONS } {
        LibraryFunctionsState::Loaded(f) => Ok(f),
        _ => Err(apperr::APP_ICU_MISSING),
    }
}
1025
1026#[allow(static_mut_refs)]
1027fn assume_loaded() -> &'static LibraryFunctions {
1028    match unsafe { &LIBRARY_FUNCTIONS } {
1029        LibraryFunctionsState::Loaded(f) => f,
1030        _ => unreachable!(),
1031    }
1032}
1033
// Hand-written declarations of the ICU C types, constants and function signatures used by
// this file. The function types are what `LibraryFunctions` stores for each resolved symbol.
mod icu_ffi {
    #![allow(dead_code, non_camel_case_types)]

    use std::ffi::{c_char, c_int, c_void};

    use crate::apperr;

    /// ICU status code. Same layout as a C `int` thanks to `#[repr(transparent)]`.
    #[derive(Copy, Clone, Eq, PartialEq)]
    #[repr(transparent)]
    pub struct UErrorCode(c_int);

    impl UErrorCode {
        /// Wraps a raw code.
        pub const fn new(code: u32) -> Self {
            Self(code as c_int)
        }

        /// Zero and negative codes count as success.
        pub fn is_success(&self) -> bool {
            self.0 <= 0
        }

        /// Only strictly positive codes count as failure.
        pub fn is_failure(&self) -> bool {
            self.0 > 0
        }

        /// Converts a failure code into an application error.
        /// Must only be called on failures (checked in debug builds).
        pub fn as_error(&self) -> apperr::Error {
            debug_assert!(self.0 > 0);
            apperr::Error::new_icu(self.0 as u32)
        }
    }

    pub const U_ZERO_ERROR: UErrorCode = UErrorCode(0);
    pub const U_BUFFER_OVERFLOW_ERROR: UErrorCode = UErrorCode(15);
    pub const U_UNSUPPORTED_ERROR: UErrorCode = UErrorCode(16);

    pub type u_errorName = unsafe extern "C" fn(code: UErrorCode) -> *const c_char;

    /// Opaque converter handle. Only ever used behind a raw pointer.
    pub struct UConverter;

    pub type ucnv_getAvailableName = unsafe extern "C" fn(n: i32) -> *const c_char;

    pub type ucnv_getStandardName = unsafe extern "C" fn(
        name: *const u8,
        standard: *const u8,
        status: &mut UErrorCode,
    ) -> *const c_char;

    pub type ucnv_open =
        unsafe extern "C" fn(converter_name: *const u8, status: &mut UErrorCode) -> *mut UConverter;

    pub type ucnv_close = unsafe extern "C" fn(converter: *mut UConverter);

    pub type ucnv_convertEx = unsafe extern "C" fn(
        target_cnv: *mut UConverter,
        source_cnv: *mut UConverter,
        target: *mut *mut u8,
        target_limit: *const u8,
        source: *mut *const u8,
        source_limit: *const u8,
        pivot_start: *mut u16,
        pivot_source: *mut *mut u16,
        pivot_target: *mut *mut u16,
        pivot_limit: *const u16,
        reset: bool,
        flush: bool,
        status: &mut UErrorCode,
    );

    /// Opaque case map handle. Only ever used behind a raw pointer.
    pub struct UCaseMap;

    pub type ucasemap_open = unsafe extern "C" fn(
        locale: *const c_char,
        options: u32,
        status: &mut UErrorCode,
    ) -> *mut UCaseMap;

    pub type ucasemap_utf8FoldCase = unsafe extern "C" fn(
        csm: *const UCaseMap,
        dest: *mut c_char,
        dest_capacity: i32,
        src: *const c_char,
        src_length: i32,
        status: &mut UErrorCode,
    ) -> i32;

    /// Result of a collation comparison, as returned by `ucol_strcollUTF8`.
    #[repr(C)]
    pub enum UCollationResult {
        UCOL_EQUAL = 0,
        UCOL_GREATER = 1,
        UCOL_LESS = -1,
    }

    /// Opaque collator handle. Only ever used behind a raw pointer.
    #[repr(C)]
    pub struct UCollator;

    pub type ucol_open =
        unsafe extern "C" fn(loc: *const c_char, status: &mut UErrorCode) -> *mut UCollator;

    pub type ucol_strcollUTF8 = unsafe extern "C" fn(
        coll: *mut UCollator,
        source: *const u8,
        source_length: i32,
        target: *const u8,
        target_length: i32,
        status: &mut UErrorCode,
    ) -> UCollationResult;

    // UText callback functions
    //
    // These are the signatures of the entries in `UTextFuncs` below.
    pub type UTextClone = unsafe extern "C" fn(
        dest: *mut UText,
        src: &UText,
        deep: bool,
        status: &mut UErrorCode,
    ) -> *mut UText;
    pub type UTextNativeLength = unsafe extern "C" fn(ut: &mut UText) -> i64;
    pub type UTextAccess =
        unsafe extern "C" fn(ut: &mut UText, native_index: i64, forward: bool) -> bool;
    pub type UTextExtract = unsafe extern "C" fn(
        ut: &mut UText,
        native_start: i64,
        native_limit: i64,
        dest: *mut u16,
        dest_capacity: i32,
        status: &mut UErrorCode,
    ) -> i32;
    pub type UTextReplace = unsafe extern "C" fn(
        ut: &mut UText,
        native_start: i64,
        native_limit: i64,
        replacement_text: *const u16,
        replacement_length: i32,
        status: &mut UErrorCode,
    ) -> i32;
    pub type UTextCopy = unsafe extern "C" fn(
        ut: &mut UText,
        native_start: i64,
        native_limit: i64,
        native_dest: i64,
        move_text: bool,
        status: &mut UErrorCode,
    );
    pub type UTextMapOffsetToNative = unsafe extern "C" fn(ut: &UText) -> i64;
    pub type UTextMapNativeIndexToUTF16 =
        unsafe extern "C" fn(ut: &UText, native_index: i64) -> i32;
    pub type UTextClose = unsafe extern "C" fn(ut: &mut UText);

    /// Callback table of a text provider, referenced by `UText::p_funcs`.
    ///
    /// `#[repr(C)]`: the field order and types are part of the ABI and must not be changed.
    /// Callbacks are optional (`None` is a null function pointer).
    #[repr(C)]
    pub struct UTextFuncs {
        pub table_size: i32,
        pub reserved1: i32,
        pub reserved2: i32,
        pub reserved3: i32,
        pub clone: Option<UTextClone>,
        pub native_length: Option<UTextNativeLength>,
        pub access: Option<UTextAccess>,
        pub extract: Option<UTextExtract>,
        pub replace: Option<UTextReplace>,
        pub copy: Option<UTextCopy>,
        pub map_offset_to_native: Option<UTextMapOffsetToNative>,
        pub map_native_index_to_utf16: Option<UTextMapNativeIndexToUTF16>,
        pub close: Option<UTextClose>,
        pub spare1: Option<UTextClose>,
        pub spare2: Option<UTextClose>,
        pub spare3: Option<UTextClose>,
    }

    /// Text abstraction that is passed to ICU by pointer (see `utext_setup`, `uregex_setUText`).
    ///
    /// `#[repr(C)]`: the field order and types are part of the ABI and must not be changed.
    #[repr(C)]
    pub struct UText {
        pub magic: u32,
        pub flags: i32,
        pub provider_properties: i32,
        pub size_of_struct: i32,
        pub chunk_native_limit: i64,
        pub extra_size: i32,
        pub native_indexing_limit: i32,
        pub chunk_native_start: i64,
        pub chunk_offset: i32,
        pub chunk_length: i32,
        pub chunk_contents: *const u16,
        pub p_funcs: &'static UTextFuncs,
        pub p_extra: *mut c_void,
        pub context: *mut c_void,
        pub p: *mut c_void,
        pub q: *mut c_void,
        pub r: *mut c_void,
        pub priv_p: *mut c_void,
        pub a: i64,
        pub b: i32,
        pub c: i32,
        pub priv_a: i64,
        pub priv_b: i32,
        pub priv_c: i32,
    }

    pub const UTEXT_MAGIC: u32 = 0x345ad82c;
    // NOTE(review): in ICU's utext.h these are consecutive enum values that denote bit
    // positions within `provider_properties`, not ready-made bit masks - confirm at use sites.
    pub const UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE: i32 = 1;
    pub const UTEXT_PROVIDER_STABLE_CHUNKS: i32 = 2;
    pub const UTEXT_PROVIDER_WRITABLE: i32 = 3;
    pub const UTEXT_PROVIDER_HAS_META_DATA: i32 = 4;
    pub const UTEXT_PROVIDER_OWNS_TEXT: i32 = 5;

    pub type utext_setup = unsafe extern "C" fn(
        ut: *mut UText,
        extra_space: i32,
        status: &mut UErrorCode,
    ) -> *mut UText;
    pub type utext_close = unsafe extern "C" fn(ut: *mut UText) -> *mut UText;

    /// Error location details that `uregex_open` can optionally fill in.
    #[repr(C)]
    pub struct UParseError {
        pub line: i32,
        pub offset: i32,
        pub pre_context: [u16; 16],
        pub post_context: [u16; 16],
    }

    /// Opaque regular expression handle. Only ever used behind a raw pointer.
    #[repr(C)]
    pub struct URegularExpression;

    // Flags for the `flags` parameter of `uregex_open`. Each one is a distinct bit.
    pub const UREGEX_UNIX_LINES: i32 = 1;
    pub const UREGEX_CASE_INSENSITIVE: i32 = 2;
    pub const UREGEX_COMMENTS: i32 = 4;
    pub const UREGEX_MULTILINE: i32 = 8;
    pub const UREGEX_LITERAL: i32 = 16;
    pub const UREGEX_DOTALL: i32 = 32;
    pub const UREGEX_UWORD: i32 = 256;
    pub const UREGEX_ERROR_ON_UNKNOWN_ESCAPES: i32 = 512;

    pub type uregex_open = unsafe extern "C" fn(
        pattern: *const u16,
        pattern_length: i32,
        flags: i32,
        pe: Option<&mut UParseError>,
        status: &mut UErrorCode,
    ) -> *mut URegularExpression;
    pub type uregex_close = unsafe extern "C" fn(regexp: *mut URegularExpression);
    pub type uregex_setTimeLimit =
        unsafe extern "C" fn(regexp: *mut URegularExpression, limit: i32, status: &mut UErrorCode);
    pub type uregex_setUText = unsafe extern "C" fn(
        regexp: *mut URegularExpression,
        text: *mut UText,
        status: &mut UErrorCode,
    );
    pub type uregex_reset64 =
        unsafe extern "C" fn(regexp: *mut URegularExpression, index: i64, status: &mut UErrorCode);
    pub type uregex_findNext =
        unsafe extern "C" fn(regexp: *mut URegularExpression, status: &mut UErrorCode) -> bool;
    pub type uregex_start64 = unsafe extern "C" fn(
        regexp: *mut URegularExpression,
        group_num: i32,
        status: &mut UErrorCode,
    ) -> i64;
    pub type uregex_end64 = unsafe extern "C" fn(
        regexp: *mut URegularExpression,
        group_num: i32,
        status: &mut UErrorCode,
    ) -> i64;
}
1291
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `compare_strings_ascii` over a table of `(a, b, expected)` cases.
    #[test]
    fn test_compare_strings_ascii() {
        let cases: [(&[u8], &[u8], Ordering); 7] = [
            // Empty strings
            (b"", b"", Ordering::Equal),
            // Equal strings
            (b"hello", b"hello", Ordering::Equal),
            // Different lengths
            (b"abc", b"abcd", Ordering::Less),
            (b"abcd", b"abc", Ordering::Greater),
            // Same chars, different cases - 1st char wins
            (b"AbC", b"aBc", Ordering::Less),
            // Different chars, different cases - 2nd char wins, because it differs
            (b"hallo", b"Hello", Ordering::Less),
            (b"Hello", b"hallo", Ordering::Greater),
        ];

        for (lhs, rhs, expected) in cases {
            assert_eq!(compare_strings_ascii(lhs, rhs), expected);
        }
    }
}
edit/icu.rs

edit/
icu.rs