relon_unicode/lib.rs
1// Relaxed from `forbid` to `deny` so the v3++ item 4 SIMD ASCII fast
2// path (`ascii_fold_simd`) can use wasm32 `v128_load` / `v128_store`
3// intrinsics, both of which are `unsafe fn` in `core::arch::wasm32`.
4// The `unsafe` blocks are confined to that single module behind a
5// `#[allow(unsafe_code)]` and each has a SAFETY comment; the rest of
6// the crate stays unsafe-free.
7#![deny(unsafe_code)]
8#![deny(unsafe_op_in_unsafe_fn)]
9
10//! Unicode-aware tables, algorithms, and the glob matcher shared by
11//! the tree-walk evaluator and the wasm-AOT / native codegen
12//! backends.
13//!
14//! This crate is a **leaf**: it depends on no other `relon-*` crate
15//! (matching `relon-util` / `relon-cap`), so it sits at the very
16//! bottom of the workspace dep graph. It consolidates every Unicode
17//! dataset, the SIMD ASCII fast path, and the linear-time glob
18//! matcher that previously lived under `relon-ir/src/unicode/` and
19//! `relon-ir/src/glob.rs`. Pulling them into a standalone crate lets
20//! `relon-evaluator` consume the shared tables without an edge to
21//! `relon-ir` (the evaluator is a tree-walk engine and never touches
22//! the IR surface), keeping the dep graph honest.
23//!
24//! `relon-ir` keeps same-named re-exports so the codegen backends
25//! that reach for `relon_ir::ascii_fold_simd` / `relon_ir::glob` /
26//! etc. compile unchanged.
27//!
28//! ### Module map
29//!
30//! * [`case_folding`] — UCD simple (1:1) upper / lower folding tables,
31//! generated at build time from `char::to_uppercase` /
32//! `char::to_lowercase`. Drives the wasm-AOT `__casefold_lookup`
33//! helper.
34//! * [`full_case_folding`] — UAX #21 full case folding (multi-codepoint
35//! mappings, Greek final sigma, Turkish / Azerbaijani locale
36//! overrides). Generated from `data/SpecialCasing.txt` via
37//! `tools/gen_full_case_folding.py`.
38//! * `full_case_folding_data` — raw generated tables for
39//! `full_case_folding`. Pulled in via `include!()` from
40//! `full_case_folding.rs` rather than declared as a sibling
41//! module, matching the pre-split layout so the generated symbols
42//! stay in a single namespace.
43//! * [`combining_marks`] — Mn + Mc + Me range table used by every
44//! case-fold body to decide whether a codepoint resets the word
45//! boundary.
46//! * [`whitespace`] — non-ASCII `White_Space` ranges (the ASCII subset
47//! is special-cased on the wasm fast path).
48//! * [`normalization`] — UAX #15 NFD / NFKD / NFC / NFKC algorithms
49//! on top of the [`normalization_data`] tables. UCD version pinned
50//! at 14.0.0; regenerate via `tools/gen_normalization_tables.py`.
51//! * [`normalization_data`] — generated UCD 14.0.0 decomposition,
52//! canonical-combining-class, and composition-pair tables.
53//! * [`ascii_fold_simd`] — v3++ item 4 SIMD ASCII fast path for the
54//! tree-walk `upper` / `lower` / `title` bodies. Only the wasm32
55//! arm uses `unsafe` v128 intrinsics; other targets stay on the
56//! chunked scalar fallback.
57//! * [`glob`] — linear-time Unicode-aware glob matcher backing the
58//! `glob_match(s, pattern) -> Bool` stdlib function.
59//!
60//! UCD version: Unicode 14.0.0 across every regeneration script.
61//! When a future Unicode bump lands, regenerate the four data-bearing
62//! siblings in one commit so the wasm-AOT data section and the
63//! tree-walk algorithm stay consistent.
64
65pub mod ascii_fold_simd;
66pub mod case_folding;
67pub mod combining_marks;
68pub mod full_case_folding;
69pub mod glob;
70pub mod normalization;
71pub mod normalization_data;
72pub mod whitespace;
73
74/// Encode a `(u32, u32)` table into the wasm data-section layout
75/// shared by case-folding, combining-mark, whitespace, and full-fold
76/// range tables: `[count: u32 LE][(a: u32 LE, b: u32 LE) × N]`. The
77/// runtime helpers all binary-search with the same `(addr + 4 + mid *
78/// 8)` rebase arithmetic, so the byte format is identical regardless
79/// of whether the pair encodes `(input_cp, output_cp)` or `(start,
80/// end)`.
81pub fn encode_u32_pair_table(table: &[(u32, u32)]) -> Vec<u8> {
82 let mut bytes = Vec::with_capacity(encoded_u32_pair_table_size(table.len()));
83 bytes.extend_from_slice(&(table.len() as u32).to_le_bytes());
84 for (a, b) in table {
85 bytes.extend_from_slice(&a.to_le_bytes());
86 bytes.extend_from_slice(&b.to_le_bytes());
87 }
88 bytes
89}
90
/// Number of bytes [`encode_u32_pair_table`] produces for a table of
/// `len` entries: a 4-byte count header followed by 8 bytes per entry.
/// Codegen calls this to pre-size data sections.
pub fn encoded_u32_pair_table_size(len: usize) -> usize {
    // Leading `u32` entry count.
    const HEADER_BYTES: usize = 4;
    // Two `u32` values per entry.
    const ENTRY_BYTES: usize = 8;

    let payload_bytes = len * ENTRY_BYTES;
    HEADER_BYTES + payload_bytes
}
96
/// Report whether `cp` lies inside any `(start, end)` entry of
/// `ranges`, with both endpoints treated as inclusive. Used by every
/// compile-time membership predicate (whitespace, combining-marks,
/// full-fold locale ranges).
///
/// The lookup is a binary search, so `ranges` is expected to be
/// sorted ascending and non-overlapping. The wasm body emits the same
/// comparison via a hand-unrolled loop, keeping the per-cp cost at
/// O(log N) on both sides.
pub fn cp_in_ranges(cp: u32, ranges: &[(u32, u32)]) -> bool {
    use std::cmp::Ordering;

    /// Order `range` relative to `cp`, as the binary search expects:
    /// `Greater` when the range starts after `cp`, `Less` when it ends
    /// before `cp`, `Equal` when it contains `cp`.
    fn range_vs_cp(range: (u32, u32), cp: u32) -> Ordering {
        let (start, end) = range;
        if cp < start {
            return Ordering::Greater;
        }
        if cp > end {
            return Ordering::Less;
        }
        Ordering::Equal
    }

    ranges
        .binary_search_by(|&range| range_vs_cp(range, cp))
        .is_ok()
}