relon_unicode/case_folding.rs
1//! v3+ a-4 Unicode-aware case folding tables embedded into wasm-AOT
2//! `upper` / `lower` stdlib bodies.
3//!
4//! The tables hold **simple** case-folding mappings only — each entry
5//! is a single-codepoint input paired with a single-codepoint
6//! replacement. Multi-codepoint cases (e.g. German `ß` -> `SS`,
7//! Latin small ligatures, Armenian `\u{0587}` -> `ԵՒ`) are excluded
8//! from the simple-folding pass and pass through unchanged in the
9//! wasm body. A full case folding pass that handles those is a v3++
10//! item.
11//!
12//! Both tables are sorted ascending by input codepoint so the wasm
13//! body's binary search keeps a stable contract. The build.rs sibling
14//! generates them at compile time from `char::to_uppercase` /
15//! `char::to_lowercase` — Rust's stdlib pulls the data from the
16//! bundled Unicode tables, which means our table tracks whichever
17//! Unicode version the host toolchain was built against.
18
// The file included below is generated by build.rs, which declares
// its tables with `pub(crate)` visibility so the IR crate's internal
// modules can access them. We re-export the tables via the helper
// functions below so the codegen crate pulls the data through a
// stable surface.
23include!(concat!(env!("OUT_DIR"), "/case_folding_table.rs"));
24
25// Re-export through `pub` so the codegen crate can splice the table
26// bytes into the wasm data section.
27//
28// Visibility note: the generated file declares these as `pub(crate)`,
29// which is too narrow for the codegen crate. We work around it by
30// wrapping with `pub` wrappers; callers go through these instead of
31// touching the `pub(crate)` consts directly.
32/// Public view of the simple upper case-folding table. Sorted by the
33/// input codepoint ascending.
34pub fn simple_upper_folding() -> &'static [(u32, u32)] {
35 SIMPLE_UPPER_FOLDING
36}
37
38/// Public view of the simple lower case-folding table. Sorted by the
39/// input codepoint ascending.
40pub fn simple_lower_folding() -> &'static [(u32, u32)] {
41 SIMPLE_LOWER_FOLDING
42}
43
44/// Encode the case-folding table into the wasm data-section layout.
45/// Delegates to [`super::encode_u32_pair_table`] — entries encode
46/// `(input_cp, output_cp)` pairs but the byte layout matches the
47/// range tables so the runtime helper can share rebase arithmetic.
48pub fn encode_table_bytes(table: &[(u32, u32)]) -> Vec<u8> {
49 super::encode_u32_pair_table(table)
50}
51
52/// Byte size of the encoded case-folding table.
53pub fn encoded_table_size(table: &[(u32, u32)]) -> usize {
54 super::encoded_u32_pair_table_size(table.len())
55}
56
#[cfg(test)]
mod tests {
    use super::*;

    /// Linear scan for the replacement codepoint stored for `input`.
    fn replacement_for(table: &[(u32, u32)], input: u32) -> Option<u32> {
        table
            .iter()
            .find(|entry| entry.0 == input)
            .map(|entry| entry.1)
    }

    /// True when every input codepoint is strictly greater than the one
    /// before it.
    fn strictly_ascending(table: &[(u32, u32)]) -> bool {
        table.windows(2).all(|pair| pair[0].0 < pair[1].0)
    }

    #[test]
    fn upper_table_sorted_and_non_empty() {
        assert!(
            !SIMPLE_UPPER_FOLDING.is_empty(),
            "upper table must not be empty"
        );
        assert!(
            strictly_ascending(SIMPLE_UPPER_FOLDING),
            "upper table must be sorted asc"
        );
    }

    #[test]
    fn lower_table_sorted_and_non_empty() {
        assert!(
            !SIMPLE_LOWER_FOLDING.is_empty(),
            "lower table must not be empty"
        );
        assert!(
            strictly_ascending(SIMPLE_LOWER_FOLDING),
            "lower table must be sorted asc"
        );
    }

    #[test]
    fn ascii_letters_present() {
        // Both directions of the ASCII pair must be covered: a -> A in the
        // upper table and A -> a in the lower table.
        let upper_of_a =
            replacement_for(SIMPLE_UPPER_FOLDING, u32::from('a')).expect("a -> A mapping");
        assert_eq!(upper_of_a, u32::from('A'));

        let lower_of_a =
            replacement_for(SIMPLE_LOWER_FOLDING, u32::from('A')).expect("A -> a mapping");
        assert_eq!(lower_of_a, u32::from('a'));
    }

    #[test]
    fn cyrillic_letters_present() {
        // U+0420 CYRILLIC CAPITAL LETTER ER pairs with U+0440 small er.
        const CAPITAL_ER: u32 = 0x0420;
        const SMALL_ER: u32 = 0x0440;

        let lowered = replacement_for(SIMPLE_LOWER_FOLDING, CAPITAL_ER).expect("Р -> р mapping");
        assert_eq!(lowered, SMALL_ER);

        let raised = replacement_for(SIMPLE_UPPER_FOLDING, SMALL_ER).expect("р -> Р mapping");
        assert_eq!(raised, CAPITAL_ER);
    }

    #[test]
    fn encode_table_bytes_layout() {
        let encoded = encode_table_bytes(&[(0x61, 0x41), (0x62, 0x42)]);
        assert_eq!(encoded.len(), 4 + 16);

        // Walk the buffer one little-endian u32 word at a time.
        let mut words = encoded.chunks_exact(4);
        // Header: entry count.
        assert_eq!(words.next(), Some(&2u32.to_le_bytes()[..]));
        // First entry payload: (0x61, 0x41).
        assert_eq!(words.next(), Some(&0x61u32.to_le_bytes()[..]));
        assert_eq!(words.next(), Some(&0x41u32.to_le_bytes()[..]));
    }
}
120}