Skip to main content

shape_runtime/stdlib/
unicode.rs

1//! Native `unicode` module for Unicode text processing.
2//!
3//! Exports: unicode.normalize, unicode.category, unicode.is_letter, unicode.is_digit, unicode.graphemes
4//!
5//! Phase 2c: migrated to the typed marshal layer
6//! (`crate::marshal::register_typed_fn_N`). Native function bodies take
7//! typed Rust args via [`crate::marshal::FromSlot`]; their Rust signatures
8//! *are* the typed signatures. The Rust trait system rejects registration
9//! whose body's parameter types don't match.
10
11use crate::marshal::{register_typed_fn_1, register_typed_fn_2};
12use crate::module_exports::ModuleExports;
13use crate::typed_module_exports::{ConcreteReturn, ConcreteType, TypedReturn};
14use std::sync::Arc;
15
16/// Create the `unicode` module.
17pub fn create_unicode_module() -> ModuleExports {
18    let mut module = ModuleExports::new("std::core::unicode");
19    module.description = "Unicode text processing utilities".to_string();
20
21    // unicode.normalize(text: string, form: string) -> string
22    register_typed_fn_2::<_, Arc<String>, Arc<String>>(
23        &mut module,
24        "normalize",
25        "Normalize a Unicode string to the specified form",
26        [("text", "string"), ("form", "string")],
27        ConcreteType::String,
28        |text, form, _ctx| {
29            use unicode_normalization::UnicodeNormalization;
30
31            let normalized: String = match form.as_str() {
32                "NFC" => text.nfc().collect(),
33                "NFD" => text.nfd().collect(),
34                "NFKC" => text.nfkc().collect(),
35                "NFKD" => text.nfkd().collect(),
36                _ => {
37                    return Err(format!(
38                        "unicode.normalize(): unknown form '{}', expected NFC/NFD/NFKC/NFKD",
39                        form.as_str()
40                    ));
41                }
42            };
43
44            Ok(TypedReturn::Concrete(ConcreteReturn::String(normalized)))
45        },
46    );
47
48    // unicode.category(codepoint: int) -> string
49    register_typed_fn_1::<_, i64>(
50        &mut module,
51        "category",
52        "Get the Unicode general category of a codepoint",
53        "codepoint",
54        "int",
55        ConcreteType::String,
56        |cp, _ctx| {
57            let ch = char::from_u32(cp as u32)
58                .ok_or_else(|| format!("unicode.category(): invalid codepoint {}", cp))?;
59
60            Ok(TypedReturn::Concrete(ConcreteReturn::String(
61                unicode_general_category(ch).to_string(),
62            )))
63        },
64    );
65
66    // unicode.is_letter(char: string) -> bool
67    register_typed_fn_1::<_, Arc<String>>(
68        &mut module,
69        "is_letter",
70        "Check if the first character is a Unicode letter",
71        "char",
72        "string",
73        ConcreteType::Bool,
74        |s, _ctx| {
75            Ok(TypedReturn::Concrete(ConcreteReturn::Bool(
76                s.chars().next().map_or(false, |c| c.is_alphabetic()),
77            )))
78        },
79    );
80
81    // unicode.is_digit(char: string) -> bool
82    register_typed_fn_1::<_, Arc<String>>(
83        &mut module,
84        "is_digit",
85        "Check if the first character is a Unicode digit",
86        "char",
87        "string",
88        ConcreteType::Bool,
89        |s, _ctx| {
90            Ok(TypedReturn::Concrete(ConcreteReturn::Bool(
91                s.chars().next().map_or(false, |c| c.is_numeric()),
92            )))
93        },
94    );
95
96    // unicode.graphemes(text: string) -> Array<string>
97    register_typed_fn_1::<_, Arc<String>>(
98        &mut module,
99        "graphemes",
100        "Split a string into Unicode grapheme clusters",
101        "text",
102        "string",
103        ConcreteType::ArrayString,
104        |text, _ctx| {
105            use unicode_segmentation::UnicodeSegmentation;
106
107            let clusters: Vec<String> = text.graphemes(true).map(|g| g.to_string()).collect();
108            Ok(TypedReturn::Concrete(ConcreteReturn::ArrayString(clusters)))
109        },
110    );
111
112    module
113}
114
/// Approximate the Unicode general category of `ch` using only `std::char`
/// classification.
///
/// Returns the two-letter general-category abbreviation (`"Lu"`, `"Nd"`,
/// `"Cc"`, ...). The result is exact for ASCII; outside ASCII it is a
/// best-effort approximation because `std` does not expose the category
/// table:
///
/// * letters are split by the `Uppercase` / `Lowercase` / `Alphabetic`
///   properties, so titlecase and modifier letters are folded into
///   `"Lu"` / `"Ll"` / `"Lo"`;
/// * non-ASCII numeric characters are all reported as `"No"`, including
///   decimal digits from other scripts;
/// * non-ASCII punctuation, symbols, marks and format characters are reported
///   as `"Cn"`.
fn unicode_general_category(ch: char) -> &'static str {
    if ch.is_uppercase() {
        "Lu"
    } else if ch.is_lowercase() {
        "Ll"
    } else if ch.is_alphabetic() {
        "Lo"
    } else if ch.is_ascii_digit() {
        "Nd"
    } else if ch.is_numeric() {
        "No"
    } else if ch.is_control() {
        // Checked before whitespace: '\t', '\n', '\r' and U+0085 are both
        // White_Space and Cc, and their general category is Cc, not Zs.
        "Cc"
    } else if ch.is_whitespace() {
        // With controls already handled, the remaining White_Space characters
        // are space separators except for the two dedicated separators.
        match ch {
            '\u{2028}' => "Zl",
            '\u{2029}' => "Zp",
            _ => "Zs",
        }
    } else {
        match ch {
            // ASCII punctuation and symbols, split by their real categories.
            '(' | '[' | '{' => "Ps",
            ')' | ']' | '}' => "Pe",
            '-' => "Pd",
            '_' => "Pc",
            '+' | '<' | '=' | '>' | '|' | '~' => "Sm",
            '$' => "Sc",
            '^' | '`' => "Sk",
            c if c.is_ascii_punctuation() => "Po",
            // Private-use areas: BMP block plus planes 15 and 16.
            '\u{E000}'..='\u{F8FF}'
            | '\u{F0000}'..='\u{FFFFD}'
            | '\u{100000}'..='\u{10FFFD}' => "Co",
            _ => "Cn",
        }
    }
}
137
#[cfg(test)]
mod tests {
    use super::*;

    /// Every export the `unicode` module is expected to register.
    const EXPECTED_EXPORTS: [&str; 5] =
        ["normalize", "category", "is_letter", "is_digit", "graphemes"];

    /// The module carries the expected name and exposes each export.
    #[test]
    fn test_unicode_module_creation() {
        let unicode = create_unicode_module();
        assert_eq!(unicode.name, "std::core::unicode");
        for export in EXPECTED_EXPORTS.iter().copied() {
            assert!(unicode.has_export(export));
        }
    }

    /// The typed registry holds an entry for each export and nothing else.
    #[test]
    fn test_unicode_typed_registry_populated() {
        let unicode = create_unicode_module();
        let registry = unicode.typed_exports();
        for export in EXPECTED_EXPORTS.iter().copied() {
            assert!(registry.get(export).is_some());
        }
        assert_eq!(registry.functions.len(), EXPECTED_EXPORTS.len());
    }

    // The behavioural invocation tests were removed: they went through
    // `module.invoke_export` with `ValueWord` arrays, the dynamic-dispatch
    // entry point that has since been deleted. Behaviour is now covered via
    // typed-slot dispatch in the marshal layer. End-to-end tests live in
    // `shape-test`'s integration suite once the strict-typed cascade reaches
    // shape-vm.
}