Skip to main content

relon_unicode/
full_case_folding.rs

1//! v3++ b-6 full Unicode case folding (UAX #21).
2//!
3//! Extends the v3+ a-4 simple folding (1:1 codepoint mappings in
4//! [`super::case_folding`]) with three additional rules drawn from
5//! UCD 14.0.0:
6//!
7//!   * **Multi-codepoint unconditional mappings** — `ß` -> `SS`, the
8//!     Latin / Armenian ligatures (`fi` -> `FI`, …), and the Greek
9//!     letters with combining marks pre-applied (`ΐ`, `ΰ`, …). Tables
10//!     live in [`FULL_UPPER_FOLDING`] and [`FULL_LOWER_FOLDING`],
11//!     generated by `tools/gen_full_case_folding.py` from
12//!     `data/SpecialCasing.txt`.
13//!
14//!   * **Greek final sigma context** — `Σ` (U+03A3) lowercases to
15//!     `ς` (U+03C2) when it appears at the end of a word (no following
16//!     cased codepoint, ignoring case-ignorable codepoints in between),
17//!     else `σ` (U+03C3). The right-scan uses [`is_cased`] and
18//!     [`is_case_ignorable`], whose backing ranges come from
19//!     `data/DerivedCoreProperties.txt`.
20//!
//!   * **Turkish / Azerbaijani locale overrides** — the `I`/`İ` /
//!     `ı`/`i` quartet behaves differently in `tr` and `az` locales
//!     (dotted and dotless `i` are distinct letters there).
//!     [`TURKISH_LOWER_FOLDING`] and [`TURKISH_UPPER_FOLDING`] hold the
//!     four override entries; the locale-aware stdlib bodies fall back
//!     to the default tables on a miss.
27//!
28//! ### Data layout (wire format for the wasm-AOT data section)
29//!
30//! Each FULL entry is `(input_cp: u32, out0: u32, out1: u32, out2: u32,
31//! out_len: u8)` — 20 bytes serialised, with the trailing `out_len`
32//! padded to a 4-byte stride for natural alignment. Up-to-3 codepoint
33//! outputs cover every SpecialCasing.txt unconditional entry (the
34//! longest is 3 cps, e.g. `U+0390` -> `0399 0308 0301`).
35//!
36//! Ranges (`CASED_RANGES`, `CASE_IGNORABLE_RANGES`) use the same
37//! `(start, end)` inclusive shape as the existing combining-mark /
38//! whitespace tables — runtime binary search reuses the same arithmetic
39//! against an 8-byte stride.
40
41// The build script-style generated file lives next to this module so
42// the include path stays short. Re-run `tools/gen_full_case_folding.py`
43// after a UCD bump.
44include!("full_case_folding_data.rs");
45
46// ---------------------------------------------------------------------------
47// Public API
48// ---------------------------------------------------------------------------
49
50/// Public view of the full multi-codepoint upper-folding table.
51/// Sorted ascending by input codepoint.
52pub fn full_upper_folding() -> &'static [(u32, u32, u32, u32, u8)] {
53    FULL_UPPER_FOLDING
54}
55
56/// Public view of the full multi-codepoint lower-folding table.
57/// Currently a single entry (U+0130 → i + combining-dot-above).
58pub fn full_lower_folding() -> &'static [(u32, u32, u32, u32, u8)] {
59    FULL_LOWER_FOLDING
60}
61
62/// Public view of the Turkish / Azerbaijani upper-folding overrides.
63/// Used only when the caller passes a `tr` / `az` locale.
64pub fn turkish_upper_folding() -> &'static [(u32, u32, u32, u32, u8)] {
65    TURKISH_UPPER_FOLDING
66}
67
68/// Public view of the Turkish / Azerbaijani lower-folding overrides.
69pub fn turkish_lower_folding() -> &'static [(u32, u32, u32, u32, u8)] {
70    TURKISH_LOWER_FOLDING
71}
72
73/// Sorted ranges of Cased codepoints (UCD `Cased` derived property).
74pub fn cased_ranges() -> &'static [(u32, u32)] {
75    CASED_RANGES
76}
77
78/// Sorted ranges of Case_Ignorable codepoints (UCD `Case_Ignorable`
79/// derived property).
80pub fn case_ignorable_ranges() -> &'static [(u32, u32)] {
81    CASE_IGNORABLE_RANGES
82}
83
84/// `true` when `cp` carries the UCD `Cased` derived property.
85#[inline]
86pub fn is_cased(cp: u32) -> bool {
87    range_contains(CASED_RANGES, cp)
88}
89
90/// `true` when `cp` carries the UCD `Case_Ignorable` derived property.
91#[inline]
92pub fn is_case_ignorable(cp: u32) -> bool {
93    range_contains(CASE_IGNORABLE_RANGES, cp)
94}
95
96/// UAX #21 Final_Sigma context check: returns `true` when `Σ` at
97/// `cps[anchor]` is at the end of a word — no later codepoint in
98/// `cps[anchor + 1 ..]` is cased, after skipping case-ignorable
99/// codepoints in between. The argument is a slice over decoded
100/// codepoints (post-UTF-8 decode); the helper consults
101/// [`is_cased`] and [`is_case_ignorable`] internally.
102pub fn is_final_sigma_context(cps: &[u32], anchor: usize) -> bool {
103    // Need at least one cased codepoint to the left of `anchor` to
104    // count as "end of word" — otherwise the sigma is a stand-alone
105    // codepoint and the canonical rule still applies (per UAX #21,
106    // `Final_Sigma` requires `Before` = there is a preceding cased
107    // codepoint plus only case-ignorables between).
108    let mut has_cased_before = false;
109    for &cp in cps[..anchor].iter().rev() {
110        if is_cased(cp) {
111            has_cased_before = true;
112            break;
113        }
114        if !is_case_ignorable(cp) {
115            break;
116        }
117    }
118    if !has_cased_before {
119        return false;
120    }
121    // After: any cased codepoint after the sigma (ignoring case
122    // ignorables) disqualifies the final-sigma form.
123    for &cp in cps[anchor + 1..].iter() {
124        if is_cased(cp) {
125            return false;
126        }
127        if !is_case_ignorable(cp) {
128            // Non-cased, non-ignorable breaks the scan — sigma is
129            // word-final.
130            return true;
131        }
132    }
133    true
134}
135
/// Decide whether a BCP-47-ish `locale` tag selects the Turkish /
/// Azerbaijani casing branch.
///
/// The first two bytes are compared ASCII-case-insensitively against
/// `tr` and `az`. They must be followed by end-of-string, `-`, or `_`;
/// whatever comes after that separator is ignored. So `tr`, `tr-TR`,
/// `tr_TR` and `az_AZ` match, while `tron` and `en` do not.
#[inline]
pub fn is_turkish_locale(locale: &str) -> bool {
    // Split off the two-byte language prefix; shorter inputs cannot match.
    let (lang, tail) = match locale.as_bytes() {
        [first, second, tail @ ..] => (
            [first.to_ascii_lowercase(), second.to_ascii_lowercase()],
            tail,
        ),
        _ => return false,
    };
    // The prefix has to end the tag or be followed by a subtag separator.
    let at_boundary = matches!(tail.first().copied(), None | Some(b'-') | Some(b'_'));
    at_boundary && (lang == *b"tr" || lang == *b"az")
}
153
154// ---------------------------------------------------------------------------
// Lookup helpers (private plumbing plus the public per-table entry lookups)
156// ---------------------------------------------------------------------------
157
158fn range_contains(ranges: &[(u32, u32)], cp: u32) -> bool {
159    super::cp_in_ranges(cp, ranges)
160}
161
162/// Look up a FULL fold entry by codepoint, returning `(out_len,
163/// [out0, out1, out2])`. Callers index the leading `out_len` slots.
164#[inline]
165pub fn full_upper_entry(cp: u32) -> Option<(u8, [u32; 3])> {
166    entry_for(FULL_UPPER_FOLDING, cp)
167}
168
169#[inline]
170pub fn full_lower_entry(cp: u32) -> Option<(u8, [u32; 3])> {
171    entry_for(FULL_LOWER_FOLDING, cp)
172}
173
174#[inline]
175pub fn turkish_upper_entry(cp: u32) -> Option<(u8, [u32; 3])> {
176    entry_for(TURKISH_UPPER_FOLDING, cp)
177}
178
179#[inline]
180pub fn turkish_lower_entry(cp: u32) -> Option<(u8, [u32; 3])> {
181    entry_for(TURKISH_LOWER_FOLDING, cp)
182}
183
/// Binary-search `table` for the entry keyed by `cp`.
///
/// `table` must be sorted ascending by its first field (the input
/// codepoint) for the search to be meaningful. A hit is reshaped from
/// `(input, out0, out1, out2, out_len)` into `(out_len, [out0, out1,
/// out2])`; a miss yields `None`.
fn entry_for(table: &'static [(u32, u32, u32, u32, u8)], cp: u32) -> Option<(u8, [u32; 3])> {
    match table.binary_search_by_key(&cp, |entry| entry.0) {
        Ok(pos) => {
            let (_input, out0, out1, out2, out_len) = table[pos];
            Some((out_len, [out0, out1, out2]))
        }
        Err(_) => None,
    }
}
189
190// ---------------------------------------------------------------------------
191// Wire-format encoders (used by the wasm-AOT data-section pass)
192// ---------------------------------------------------------------------------
193
/// Serialise a FULL fold table for the wasm data section.
///
/// Wire layout: a 4-byte little-endian entry count, then `count * 20`
/// bytes. Every entry is five little-endian `u32` words in the order
/// `input_cp, out0, out1, out2, out_len`. `out_len` is a `u8` in
/// memory and is widened to `u32` on the wire so the per-entry stride
/// stays at 20 bytes, matching the runtime helper's `4 + mid * 20`
/// rebase arithmetic.
pub fn encode_full_table_bytes(table: &[(u32, u32, u32, u32, u8)]) -> Vec<u8> {
    let mut out = Vec::with_capacity(4 + table.len() * 20);
    // Header: number of entries that follow.
    out.extend_from_slice(&(table.len() as u32).to_le_bytes());
    for &(input, out0, out1, out2, out_len) in table {
        // Emit the five words of one entry in wire order.
        for word in &[input, out0, out1, out2, u32::from(out_len)] {
            out.extend_from_slice(&word.to_le_bytes());
        }
    }
    out
}
213
/// Number of bytes an encoded FULL fold table occupies: the 4-byte
/// count header plus 20 bytes per entry. Codegen uses this to pre-size
/// the data section.
pub fn encoded_full_table_size(table: &[(u32, u32, u32, u32, u8)]) -> usize {
    const HEADER_BYTES: usize = 4;
    const ENTRY_BYTES: usize = 20;
    HEADER_BYTES + table.len() * ENTRY_BYTES
}
219
/// Serialise a FULL table in the simple 8-byte-stride layout that the
/// `__casefold_lookup` helper expects (one input cp -> one output cp).
///
/// Wire layout: a 4-byte little-endian entry count, then one
/// `(input_cp: u32 LE, out0: u32 LE)` pair per entry. Only the FIRST
/// output slot is written, so for multi-codepoint entries
/// (`out_len > 1`) the trailing codepoints are dropped. Callers are
/// expected to detect that case via a parallel `out_len` check, or to
/// use the 20-byte encoder when the trailing codepoints matter.
///
/// Used by the wasm-AOT locale-aware bodies to share the existing
/// `__casefold_lookup` helper while still consulting the Turkish
/// override entries (which today happen to all be 1:1 mappings).
pub fn encode_simple_view_bytes(table: &[(u32, u32, u32, u32, u8)]) -> Vec<u8> {
    let mut out = Vec::with_capacity(4 + table.len() * 8);
    // Header: number of (input, output) pairs that follow.
    out.extend_from_slice(&(table.len() as u32).to_le_bytes());
    for &(input, first_out, ..) in table {
        for word in &[input, first_out] {
            out.extend_from_slice(&word.to_le_bytes());
        }
    }
    out
}
239
240/// Encode a range table — same shape as the combining-mark /
241/// whitespace ranges. Delegates to [`super::encode_u32_pair_table`].
242pub fn encode_ranges_bytes(table: &[(u32, u32)]) -> Vec<u8> {
243    super::encode_u32_pair_table(table)
244}
245
#[cfg(test)]
mod tests {
    use super::*;

    /// Decode `text` into the `u32` codepoint slice shape that
    /// `is_final_sigma_context` takes.
    fn codepoints(text: &str) -> Vec<u32> {
        text.chars().map(u32::from).collect()
    }

    /// Fail with `message` unless `table` is strictly ascending by
    /// input codepoint.
    fn assert_strictly_ascending(table: &[(u32, u32, u32, u32, u8)], message: &str) {
        for pair in table.windows(2) {
            assert!(pair[0].0 < pair[1].0, "{}", message);
        }
    }

    // --- Multi-codepoint table lookups -------------------------------

    #[test]
    fn sharp_s_upper_is_ss() {
        let (out_len, outputs) = full_upper_entry(0x00DF).expect("ß entry");
        assert_eq!(out_len, 2);
        assert_eq!(&outputs[..2], &[0x0053, 0x0053]);
    }

    #[test]
    fn fi_ligature_upper_is_fi() {
        let (out_len, outputs) = full_upper_entry(0xFB01).expect("fi entry");
        assert_eq!(out_len, 2);
        assert_eq!(&outputs[..2], &[0x0046, 0x0049]);
    }

    #[test]
    fn capital_i_with_dot_lower_is_i_plus_dot() {
        let (out_len, outputs) = full_lower_entry(0x0130).expect("İ entry");
        assert_eq!(out_len, 2);
        assert_eq!(&outputs[..2], &[0x0069, 0x0307]);
    }

    #[test]
    fn turkish_capital_i_lower_is_dotless() {
        let (out_len, outputs) = turkish_lower_entry(0x0049).expect("tr I entry");
        assert_eq!(out_len, 1);
        assert_eq!(outputs[0], 0x0131);
    }

    // --- Derived-property predicates ---------------------------------

    #[test]
    fn cased_basic_letters() {
        // 'A' and 'Σ' are cased; space and '0' are not.
        assert!(is_cased(0x0041));
        assert!(is_cased(0x03A3));
        assert!(!is_cased(0x0020));
        assert!(!is_cased(0x0030));
    }

    #[test]
    fn case_ignorable_basic_marks() {
        // Apostrophe and combining acute are ignorable; 'A' is cased,
        // not ignorable.
        assert!(is_case_ignorable(0x0027));
        assert!(is_case_ignorable(0x0301));
        assert!(!is_case_ignorable(0x0041));
    }

    // --- Final sigma context -----------------------------------------

    #[test]
    fn final_sigma_context_at_word_end() {
        // "OΣ": the sigma at index 1 has a cased letter before it and
        // nothing after it.
        let word = codepoints("OΣ");
        assert!(is_final_sigma_context(&word, 1));
    }

    #[test]
    fn non_final_sigma_in_middle_of_word() {
        // "ΣΑ": the sigma opens the word and a cased letter follows.
        let word = codepoints("ΣΑ");
        assert!(!is_final_sigma_context(&word, 0));
    }

    #[test]
    fn final_sigma_through_case_ignorable() {
        // "OΣ'": only a case-ignorable apostrophe follows the sigma at
        // index 1, so the final form still applies.
        let word = codepoints("OΣ'");
        assert!(is_final_sigma_context(&word, 1));
    }

    // --- Locale resolution -------------------------------------------

    #[test]
    fn locale_match_tr_and_az() {
        for tag in ["tr", "TR", "tr-TR", "az_AZ"].iter() {
            assert!(is_turkish_locale(tag));
        }
        // "tron" shares the `tr` prefix but is not a locale match.
        for tag in ["en", "de-DE", "", "tron"].iter() {
            assert!(!is_turkish_locale(tag));
        }
    }

    // --- Wire format -------------------------------------------------

    #[test]
    fn encode_full_table_layout() {
        let toy: &[(u32, u32, u32, u32, u8)] = &[(0x00DF, 0x0053, 0x0053, 0x0000, 2)];
        let encoded = encode_full_table_bytes(toy);
        assert_eq!(encoded.len(), 4 + 20);
        // Header, then input cp, first output, and the widened out_len.
        assert_eq!(&encoded[0..4], &1u32.to_le_bytes());
        assert_eq!(&encoded[4..8], &0x00DFu32.to_le_bytes());
        assert_eq!(&encoded[8..12], &0x0053u32.to_le_bytes());
        assert_eq!(&encoded[20..24], &2u32.to_le_bytes());
    }

    // --- Generated-table invariants ----------------------------------

    #[test]
    fn upper_table_sorted_and_non_empty() {
        assert!(!FULL_UPPER_FOLDING.is_empty());
        assert_strictly_ascending(FULL_UPPER_FOLDING, "FULL upper table must be sorted asc");
    }

    #[test]
    fn lower_table_sorted() {
        assert_strictly_ascending(FULL_LOWER_FOLDING, "FULL lower table must be sorted asc");
    }
}