Skip to main content

relon_unicode/
combining_marks.rs

1//! v3++ b-4 Unicode combining-mark range table embedded into the
2//! wasm-AOT `title` / `upper` / `lower` stdlib bodies.
3//!
4//! The table lists the inclusive `(start, end)` codepoint ranges of
5//! the Unicode `Mark` general category (M = Mn + Mc + Me). Marks have
6//! no case (Unicode treats them as non-cased), so the case-folding
7//! bodies emit them verbatim. For the `title` body the additional
8//! contract is that a Mark **does not** flip the `at_word_start`
9//! flag — a mark belongs to its base codepoint's grapheme cluster,
10//! so the next codepoint after the base+mark sequence should still
11//! be treated as "after the first letter of the word" rather than
12//! "first letter of a new word".
13//!
14//! Why hand-maintained:
15//!   * `std` does not expose Unicode general-category data.
16//!   * Pulling `icu_properties` (or `unicode-properties`) as a
17//!     build-dep just to enumerate Marks adds a multi-MB build-tree
18//!     dependency for a few hundred bytes of data.
19//!   * Mark ranges are stable across Unicode revisions — new ranges
20//!     get appended over time, but existing ranges never shrink, so a
21//!     hand-maintained table only needs an additive update per
22//!     Unicode release.
23//!
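//! Coverage version: **Unicode 14.0.0** (matches the same Unicode
//! revision the case-folding tables were derived against in v3+ a-4).
//! When the host toolchain bumps its bundled UCD, insert any new
//! ranges into `COMBINING_MARK_RANGES` at their sorted position (or
//! widen the neighbouring range) so the layout invariants listed
//! below keep holding, and bump the version noted in the table's doc
//! comment.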
28//!
29//! Layout invariants (the wasm runtime helper relies on these):
30//!   * Sorted ascending by `start`.
31//!   * Non-overlapping (binary search assumes `prev.end < next.start`).
32//!   * `start <= end` for every range.
33//!
34//! The encoded byte layout the wasm body binary-searches mirrors the
35//! case-folding table: a leading u32 LE count followed by
36//! `count * (u32 LE start, u32 LE end)` pairs.
37
/// Unicode 14.0.0 Mark category ranges (Mn + Mc + Me), sorted
/// ascending by `start`, inclusive on both ends.
///
/// The list is hand-curated against the published UCD data. Every
/// entry must satisfy `start <= end`, and consecutive entries must
/// satisfy `prev.end < next.start`; the binary search over this
/// table relies on both properties.
///
/// When a later Unicode release introduces new marks, insert the new
/// range at its sorted position (or widen the neighbouring range).
/// Blindly appending at the end would break the ordering invariant.
#[rustfmt::skip]
pub const COMBINING_MARK_RANGES: &[(u32, u32)] = &[
    // Combining Diacritical Marks (Mn).
    (0x0300, 0x036F),
    // Cyrillic (Mn + Me).
    (0x0483, 0x0489),
    // Hebrew (Mn).
    (0x0591, 0x05BD),
    (0x05BF, 0x05BF),
    (0x05C1, 0x05C2),
    (0x05C4, 0x05C5),
    (0x05C7, 0x05C7),
    // Arabic (Mn).
    (0x0610, 0x061A),
    (0x064B, 0x065F),
    (0x0670, 0x0670),
    (0x06D6, 0x06DC),
    (0x06DF, 0x06E4),
    (0x06E7, 0x06E8),
    (0x06EA, 0x06ED),
    // Syriac (Mn).
    (0x0711, 0x0711),
    (0x0730, 0x074A),
    // Thaana (Mn).
    (0x07A6, 0x07B0),
    // NKo (Mn).
    (0x07EB, 0x07F3),
    (0x07FD, 0x07FD),
    // Samaritan (Mn).
    (0x0816, 0x0819),
    (0x081B, 0x0823),
    (0x0825, 0x0827),
    (0x0829, 0x082D),
    // Mandaic (Mn).
    (0x0859, 0x085B),
    // Arabic Extended-B (Mn) — block introduced in Unicode 14.0.
    (0x0898, 0x089F),
    // Arabic Extended-A (Mn). U+08CA..U+08D2 were added in Unicode
    // 14.0; U+08E2 is a format character and is deliberately skipped.
    // The second range runs on into Devanagari U+0900..U+0902.
    (0x08CA, 0x08E1),
    (0x08E3, 0x0902),
    // Devanagari (Mn + Mc).
    (0x0903, 0x0903),
    (0x093A, 0x093A),
    (0x093B, 0x093B),
    (0x093C, 0x093C),
    (0x093E, 0x094F),
    (0x0951, 0x0957),
    (0x0962, 0x0963),
    // Bengali.
    (0x0981, 0x0983),
    (0x09BC, 0x09BC),
    (0x09BE, 0x09C4),
    (0x09C7, 0x09C8),
    (0x09CB, 0x09CD),
    (0x09D7, 0x09D7),
    (0x09E2, 0x09E3),
    (0x09FE, 0x09FE),
    // Gurmukhi.
    (0x0A01, 0x0A03),
    (0x0A3C, 0x0A3C),
    (0x0A3E, 0x0A42),
    (0x0A47, 0x0A48),
    (0x0A4B, 0x0A4D),
    (0x0A51, 0x0A51),
    (0x0A70, 0x0A71),
    (0x0A75, 0x0A75),
    // Gujarati.
    (0x0A81, 0x0A83),
    (0x0ABC, 0x0ABC),
    (0x0ABE, 0x0AC5),
    (0x0AC7, 0x0AC9),
    (0x0ACB, 0x0ACD),
    (0x0AE2, 0x0AE3),
    (0x0AFA, 0x0AFF),
    // Oriya.
    (0x0B01, 0x0B03),
    (0x0B3C, 0x0B3C),
    (0x0B3E, 0x0B44),
    (0x0B47, 0x0B48),
    (0x0B4B, 0x0B4D),
    (0x0B55, 0x0B57),
    (0x0B62, 0x0B63),
    // Tamil.
    (0x0B82, 0x0B82),
    (0x0BBE, 0x0BC2),
    (0x0BC6, 0x0BC8),
    (0x0BCA, 0x0BCD),
    (0x0BD7, 0x0BD7),
    // Telugu.
    (0x0C00, 0x0C04),
    (0x0C3C, 0x0C3C),
    (0x0C3E, 0x0C44),
    (0x0C46, 0x0C48),
    (0x0C4A, 0x0C4D),
    (0x0C55, 0x0C56),
    (0x0C62, 0x0C63),
    // Kannada.
    (0x0C81, 0x0C83),
    (0x0CBC, 0x0CBC),
    (0x0CBE, 0x0CC4),
    (0x0CC6, 0x0CC8),
    (0x0CCA, 0x0CCD),
    (0x0CD5, 0x0CD6),
    (0x0CE2, 0x0CE3),
    // Malayalam.
    (0x0D00, 0x0D03),
    (0x0D3B, 0x0D3C),
    (0x0D3E, 0x0D44),
    (0x0D46, 0x0D48),
    (0x0D4A, 0x0D4D),
    (0x0D57, 0x0D57),
    (0x0D62, 0x0D63),
    // Sinhala.
    (0x0D81, 0x0D83),
    (0x0DCA, 0x0DCA),
    (0x0DCF, 0x0DD4),
    (0x0DD6, 0x0DD6),
    (0x0DD8, 0x0DDF),
    (0x0DF2, 0x0DF3),
    // Thai.
    (0x0E31, 0x0E31),
    (0x0E34, 0x0E3A),
    (0x0E47, 0x0E4E),
    // Lao.
    (0x0EB1, 0x0EB1),
    (0x0EB4, 0x0EBC),
    (0x0EC8, 0x0ECD),
    // Tibetan.
    (0x0F18, 0x0F19),
    (0x0F35, 0x0F35),
    (0x0F37, 0x0F37),
    (0x0F39, 0x0F39),
    (0x0F3E, 0x0F3F),
    (0x0F71, 0x0F84),
    (0x0F86, 0x0F87),
    (0x0F8D, 0x0F97),
    (0x0F99, 0x0FBC),
    (0x0FC6, 0x0FC6),
    // Myanmar.
    (0x102B, 0x103E),
    (0x1056, 0x1059),
    (0x105E, 0x1060),
    (0x1062, 0x1064),
    (0x1067, 0x106D),
    (0x1071, 0x1074),
    (0x1082, 0x108D),
    (0x108F, 0x108F),
    (0x109A, 0x109D),
    // Ethiopic.
    (0x135D, 0x135F),
    // Tagalog / Hanunoo / Buhid / Tagbanwa.
    (0x1712, 0x1715),
    (0x1732, 0x1734),
    (0x1752, 0x1753),
    (0x1772, 0x1773),
    // Khmer.
    (0x17B4, 0x17D3),
    (0x17DD, 0x17DD),
    // Mongolian. U+180E is not a Mark, so U+180F (Free Variation
    // Selector Four, added in Unicode 14.0) is its own entry.
    (0x180B, 0x180D),
    (0x180F, 0x180F),
    (0x1885, 0x1886),
    (0x18A9, 0x18A9),
    // Limbu.
    (0x1920, 0x192B),
    (0x1930, 0x193B),
    // Buginese.
    (0x1A17, 0x1A1B),
    // Tai Tham.
    (0x1A55, 0x1A5E),
    (0x1A60, 0x1A7C),
    (0x1A7F, 0x1A7F),
    // Combining Diacritical Marks Extended.
    (0x1AB0, 0x1ACE),
    // Balinese.
    (0x1B00, 0x1B04),
    (0x1B34, 0x1B44),
    (0x1B6B, 0x1B73),
    // Sundanese.
    (0x1B80, 0x1B82),
    (0x1BA1, 0x1BAD),
    // Batak.
    (0x1BE6, 0x1BF3),
    // Lepcha.
    (0x1C24, 0x1C37),
    // Vedic Extensions.
    (0x1CD0, 0x1CD2),
    (0x1CD4, 0x1CE8),
    (0x1CED, 0x1CED),
    (0x1CF4, 0x1CF4),
    (0x1CF7, 0x1CF9),
    // Combining Diacritical Marks Supplement.
    (0x1DC0, 0x1DFF),
    // Combining Diacritical Marks for Symbols.
    (0x20D0, 0x20F0),
    // Coptic.
    (0x2CEF, 0x2CF1),
    // Tifinagh.
    (0x2D7F, 0x2D7F),
    // Cyrillic Extended-A (combining Cyrillic letters).
    (0x2DE0, 0x2DFF),
    // CJK ideographic combining marks.
    (0x302A, 0x302F),
    (0x3099, 0x309A),
    // Combining Cyrillic Letter ranges (Cyrillic Extended-B).
    (0xA66F, 0xA672),
    (0xA674, 0xA67D),
    (0xA69E, 0xA69F),
    // Bamum / Syloti.
    (0xA6F0, 0xA6F1),
    (0xA802, 0xA802),
    (0xA806, 0xA806),
    (0xA80B, 0xA80B),
    (0xA823, 0xA827),
    (0xA82C, 0xA82C),
    // Saurashtra / Devanagari Extended, followed by the remaining
    // U+A9xx..U+ABxx script blocks (Kayah Li through Meetei Mayek).
    (0xA880, 0xA881),
    (0xA8B4, 0xA8C5),
    (0xA8E0, 0xA8F1),
    (0xA8FF, 0xA8FF),
    (0xA926, 0xA92D),
    (0xA947, 0xA953),
    (0xA980, 0xA983),
    (0xA9B3, 0xA9C0),
    (0xA9E5, 0xA9E5),
    (0xAA29, 0xAA36),
    (0xAA43, 0xAA43),
    (0xAA4C, 0xAA4D),
    (0xAA7B, 0xAA7D),
    (0xAAB0, 0xAAB0),
    (0xAAB2, 0xAAB4),
    (0xAAB7, 0xAAB8),
    (0xAABE, 0xAABF),
    (0xAAC1, 0xAAC1),
    (0xAAEB, 0xAAEF),
    (0xAAF5, 0xAAF6),
    (0xABE3, 0xABEA),
    (0xABEC, 0xABED),
    // Hebrew presentation forms.
    (0xFB1E, 0xFB1E),
    // Variation Selectors.
    (0xFE00, 0xFE0F),
    // Combining Half Marks.
    (0xFE20, 0xFE2F),
    // Supplementary planes (selected — common scripts that ship with
    // marks). The wasm body skips marks in the entire table; the
    // listing below covers the supplementary-plane scripts that
    // real-world inputs hit. Future Unicode bumps insert new ranges
    // here at their sorted position.
    (0x101FD, 0x101FD),
    (0x102E0, 0x102E0),
    (0x10376, 0x1037A),
    (0x10A01, 0x10A03),
    (0x10A05, 0x10A06),
    (0x10A0C, 0x10A0F),
    (0x10A38, 0x10A3A),
    (0x10A3F, 0x10A3F),
    (0x10AE5, 0x10AE6),
    (0x10D24, 0x10D27),
    (0x10EAB, 0x10EAC),
    (0x10F46, 0x10F50),
    (0x10F82, 0x10F85),
    (0x11000, 0x11002),
    (0x11038, 0x11046),
    (0x11070, 0x11070),
    (0x11073, 0x11074),
    (0x1107F, 0x11082),
    (0x110B0, 0x110BA),
    (0x110C2, 0x110C2),
    (0x11100, 0x11102),
    (0x11127, 0x11134),
    (0x11145, 0x11146),
    (0x11173, 0x11173),
    (0x11180, 0x11182),
    (0x111B3, 0x111C0),
    (0x111C9, 0x111CC),
    (0x111CE, 0x111CF),
    (0x1122C, 0x11237),
    (0x1123E, 0x1123E),
    (0x112DF, 0x112EA),
    (0x11300, 0x11303),
    (0x1133B, 0x1133C),
    (0x1133E, 0x11344),
    (0x11347, 0x11348),
    (0x1134B, 0x1134D),
    (0x11357, 0x11357),
    (0x11362, 0x11363),
    (0x11366, 0x1136C),
    (0x11370, 0x11374),
    (0x11435, 0x11446),
    (0x1145E, 0x1145E),
    (0x114B0, 0x114C3),
    (0x115AF, 0x115B5),
    (0x115B8, 0x115C0),
    (0x115DC, 0x115DD),
    (0x11630, 0x11640),
    (0x116AB, 0x116B7),
    (0x1171D, 0x1172B),
    (0x1182C, 0x1183A),
    (0x11930, 0x11935),
    (0x11937, 0x11938),
    (0x1193B, 0x1193E),
    (0x11940, 0x11940),
    (0x11942, 0x11943),
    (0x119D1, 0x119D7),
    (0x119DA, 0x119E0),
    (0x119E4, 0x119E4),
    (0x11A01, 0x11A0A),
    (0x11A33, 0x11A39),
    (0x11A3B, 0x11A3E),
    (0x11A47, 0x11A47),
    (0x11A51, 0x11A5B),
    (0x11A8A, 0x11A99),
    (0x11C2F, 0x11C36),
    (0x11C38, 0x11C3F),
    (0x11C92, 0x11CA7),
    (0x11CA9, 0x11CB6),
    (0x11D31, 0x11D36),
    (0x11D3A, 0x11D3A),
    (0x11D3C, 0x11D3D),
    (0x11D3F, 0x11D45),
    (0x11D47, 0x11D47),
    (0x11D8A, 0x11D8E),
    (0x11D90, 0x11D91),
    (0x11D93, 0x11D97),
    (0x11EF3, 0x11EF6),
    (0x16AF0, 0x16AF4),
    (0x16B30, 0x16B36),
    (0x16F4F, 0x16F4F),
    (0x16F51, 0x16F87),
    (0x16F8F, 0x16F92),
    (0x16FE4, 0x16FE4),
    (0x16FF0, 0x16FF1),
    (0x1BC9D, 0x1BC9E),
    (0x1CF00, 0x1CF2D),
    (0x1CF30, 0x1CF46),
    (0x1D165, 0x1D169),
    (0x1D16D, 0x1D172),
    (0x1D17B, 0x1D182),
    (0x1D185, 0x1D18B),
    (0x1D1AA, 0x1D1AD),
    (0x1D242, 0x1D244),
    (0x1DA00, 0x1DA36),
    (0x1DA3B, 0x1DA6C),
    (0x1DA75, 0x1DA75),
    (0x1DA84, 0x1DA84),
    (0x1DA9B, 0x1DA9F),
    (0x1DAA1, 0x1DAAF),
    (0x1E000, 0x1E006),
    (0x1E008, 0x1E018),
    (0x1E01B, 0x1E021),
    (0x1E023, 0x1E024),
    (0x1E026, 0x1E02A),
    (0x1E130, 0x1E136),
    (0x1E2AE, 0x1E2AE),
    (0x1E2EC, 0x1E2EF),
    (0x1E8D0, 0x1E8D6),
    (0x1E944, 0x1E94A),
    // Variation Selectors Supplement.
    (0xE0100, 0xE01EF),
];
396
397/// Public view of the combining-mark ranges. Sorted ascending by
398/// the start of each range; the wasm runtime helper depends on this
399/// invariant for its binary search.
400pub fn combining_mark_ranges() -> &'static [(u32, u32)] {
401    COMBINING_MARK_RANGES
402}
403
404/// Encode the combining-mark range table into the wasm data-section
405/// layout. Delegates to [`super::encode_u32_pair_table`].
406pub fn encode_ranges_bytes(table: &[(u32, u32)]) -> Vec<u8> {
407    super::encode_u32_pair_table(table)
408}
409
410/// Byte size of the encoded ranges table.
411pub fn encoded_ranges_size(table: &[(u32, u32)]) -> usize {
412    super::encoded_u32_pair_table_size(table.len())
413}
414
415/// Compile-time check (mirrors the runtime contract). Returns true
416/// when `cp` falls inside any of the ranges in
417/// [`COMBINING_MARK_RANGES`]. The wasm body uses a binary-search
418/// loop instead so the per-codepoint cost stays O(log N) — this
419/// helper is only used by the unit tests and the tree-walk
420/// evaluator's title implementation.
421pub fn is_combining_mark(cp: u32) -> bool {
422    super::cp_in_ranges(cp, COMBINING_MARK_RANGES)
423}
424
#[cfg(test)]
mod tests {
    use super::*;

    /// Guards the layout invariants the binary search depends on:
    /// every range is well-formed and the table is strictly ascending
    /// with no overlap.
    #[test]
    fn ranges_sorted_non_overlapping() {
        let table = COMBINING_MARK_RANGES;

        // Check each range on its own. A `windows(2)` pass that only
        // inspects the second element of every pair never validates
        // the first table entry (and checks nothing at all for a
        // one-entry table).
        for &(start, end) in table {
            assert!(
                start <= end,
                "range start must be <= end; got {start:#x}..={end:#x}"
            );
        }

        // Then check ordering between neighbours.
        for win in table.windows(2) {
            let (_, prev_end) = win[0];
            let (next_start, _) = win[1];
            assert!(
                prev_end < next_start,
                "combining-mark ranges must be sorted + non-overlapping; \
                 prev_end={prev_end:#x} >= next_start={next_start:#x}"
            );
        }
    }

    /// Spot-checks a few well-known marks plus one deliberate
    /// non-mark.
    #[test]
    fn common_combining_marks_present() {
        // U+0301 (Combining Acute Accent) is the canonical Mn example.
        assert!(is_combining_mark(0x0301));
        // U+0302 (Combining Circumflex).
        assert!(is_combining_mark(0x0302));
        // U+0308 (Combining Diaeresis).
        assert!(is_combining_mark(0x0308));
        // U+FE0F (Variation Selector-16, emoji presentation).
        assert!(is_combining_mark(0xFE0F));
        // U+200D (Zero-Width Joiner) is NOT a mark (it's a Format
        // char). Confirms the ZWJ path stays orthogonal to the
        // mark detection.
        assert!(!is_combining_mark(0x200D));
    }

    /// No printable ASCII codepoint (U+0020..U+007E) may be reported
    /// as a mark.
    #[test]
    fn ascii_letters_not_marks() {
        for cp in 0x20u32..0x7F {
            assert!(
                !is_combining_mark(cp),
                "ascii cp {cp:#x} must not be detected as a Mark"
            );
        }
    }

    /// Pins the encoded byte layout: a u32 LE count followed by
    /// `(u32 LE start, u32 LE end)` pairs.
    #[test]
    fn encode_ranges_layout() {
        let toy: &[(u32, u32)] = &[(0x300, 0x36F), (0x483, 0x489)];
        let bytes = encode_ranges_bytes(toy);
        assert_eq!(bytes.len(), 4 + 16);
        assert_eq!(&bytes[0..4], &2u32.to_le_bytes());
        assert_eq!(&bytes[4..8], &0x300u32.to_le_bytes());
        assert_eq!(&bytes[8..12], &0x36Fu32.to_le_bytes());
        assert_eq!(&bytes[12..16], &0x483u32.to_le_bytes());
        assert_eq!(&bytes[16..20], &0x489u32.to_le_bytes());
    }
}