Skip to main content

relon_unicode/
case_folding.rs

1//! v3+ a-4 Unicode-aware case folding tables embedded into wasm-AOT
2//! `upper` / `lower` stdlib bodies.
3//!
4//! The tables hold **simple** case-folding mappings only — each entry
5//! is a single-codepoint input paired with a single-codepoint
6//! replacement. Multi-codepoint cases (e.g. German `ß` -> `SS`,
7//! Latin small ligatures, Armenian `\u{0587}` -> `ԵՒ`) are excluded
8//! from the simple-folding pass and pass through unchanged in the
9//! wasm body. A full case folding pass that handles those is a v3++
10//! item.
11//!
12//! Both tables are sorted ascending by input codepoint so the wasm
13//! body's binary search keeps a stable contract. The build.rs sibling
14//! generates them at compile time from `char::to_uppercase` /
15//! `char::to_lowercase` — Rust's stdlib pulls the data from the
16//! bundled Unicode tables, which means our table tracks whichever
17//! Unicode version the host toolchain was built against.
18
// The build.rs generates the included table file with `pub(crate)`
// visibility so the IR crate's internal modules can access it. We
// re-export the tables via the accessor functions below so the codegen
// crate pulls the data through a stable surface.
23include!(concat!(env!("OUT_DIR"), "/case_folding_table.rs"));
24
25// Re-export through `pub` so the codegen crate can splice the table
26// bytes into the wasm data section.
27//
// Visibility note: the generated file declares these as `pub(crate)`,
// which is too narrow for the codegen crate. We work around it with
// `pub` accessor functions; callers go through these instead of
// touching the `pub(crate)` consts directly.
32/// Public view of the simple upper case-folding table. Sorted by the
33/// input codepoint ascending.
34pub fn simple_upper_folding() -> &'static [(u32, u32)] {
35    SIMPLE_UPPER_FOLDING
36}
37
38/// Public view of the simple lower case-folding table. Sorted by the
39/// input codepoint ascending.
40pub fn simple_lower_folding() -> &'static [(u32, u32)] {
41    SIMPLE_LOWER_FOLDING
42}
43
44/// Encode the case-folding table into the wasm data-section layout.
45/// Delegates to [`super::encode_u32_pair_table`] — entries encode
46/// `(input_cp, output_cp)` pairs but the byte layout matches the
47/// range tables so the runtime helper can share rebase arithmetic.
48pub fn encode_table_bytes(table: &[(u32, u32)]) -> Vec<u8> {
49    super::encode_u32_pair_table(table)
50}
51
52/// Byte size of the encoded case-folding table.
53pub fn encoded_table_size(table: &[(u32, u32)]) -> usize {
54    super::encoded_u32_pair_table_size(table.len())
55}
56
#[cfg(test)]
mod tests {
    use super::*;

    /// Linear lookup of `cp` in `table`; yields the mapped codepoint of
    /// the first entry whose input matches, or `None` when absent.
    fn lookup(table: &[(u32, u32)], cp: u32) -> Option<u32> {
        table
            .iter()
            .copied()
            .find(|&(input, _)| input == cp)
            .map(|(_, output)| output)
    }

    #[test]
    fn upper_table_sorted_and_non_empty() {
        let entries = SIMPLE_UPPER_FOLDING;
        assert!(!entries.is_empty(), "upper table must not be empty");
        // Pair every entry with its successor; inputs must strictly increase.
        for (&(prev_cp, _), &(next_cp, _)) in entries.iter().zip(entries.iter().skip(1)) {
            assert!(prev_cp < next_cp, "upper table must be sorted asc");
        }
    }

    #[test]
    fn lower_table_sorted_and_non_empty() {
        let entries = SIMPLE_LOWER_FOLDING;
        assert!(!entries.is_empty(), "lower table must not be empty");
        // Pair every entry with its successor; inputs must strictly increase.
        for (&(prev_cp, _), &(next_cp, _)) in entries.iter().zip(entries.iter().skip(1)) {
            assert!(prev_cp < next_cp, "lower table must be sorted asc");
        }
    }

    #[test]
    fn ascii_letters_present() {
        // The simple tables must carry both ASCII directions: a -> A and A -> a.
        let small_a = u32::from('a');
        let capital_a = u32::from('A');

        let uppercased = lookup(SIMPLE_UPPER_FOLDING, small_a).expect("a -> A mapping");
        assert_eq!(uppercased, capital_a);

        let lowercased = lookup(SIMPLE_LOWER_FOLDING, capital_a).expect("A -> a mapping");
        assert_eq!(lowercased, small_a);
    }

    #[test]
    fn cyrillic_letters_present() {
        // U+0420 CYRILLIC CAPITAL LETTER ER pairs with U+0440 small er.
        const CAPITAL_ER: u32 = 0x0420;
        const SMALL_ER: u32 = 0x0440;

        let lowercased = lookup(SIMPLE_LOWER_FOLDING, CAPITAL_ER).expect("Р -> р mapping");
        assert_eq!(lowercased, SMALL_ER);

        let uppercased = lookup(SIMPLE_UPPER_FOLDING, SMALL_ER).expect("р -> Р mapping");
        assert_eq!(uppercased, CAPITAL_ER);
    }

    #[test]
    fn encode_table_bytes_layout() {
        let toy = [(0x61_u32, 0x41_u32), (0x62, 0x42)];
        let bytes = encode_table_bytes(&toy);
        // 4-byte header plus two 8-byte entries.
        assert_eq!(bytes.len(), 4 + 16);
        // The header is the entry count as a little-endian u32.
        assert_eq!(bytes[..4], 2u32.to_le_bytes());
        // First entry payload: input 0x61 followed by output 0x41.
        assert_eq!(bytes[4..8], 0x61u32.to_le_bytes());
        assert_eq!(bytes[8..12], 0x41u32.to_le_bytes());
    }
}