Skip to main content

shape_runtime/stdlib/
unicode.rs

1//! Native `unicode` module for Unicode text processing.
2//!
3//! Exports: unicode.normalize, unicode.category, unicode.is_letter, unicode.is_digit, unicode.graphemes
4
5use crate::module_exports::{ModuleContext, ModuleExports, ModuleFunction, ModuleParam};
6use shape_value::ValueWord;
7use std::sync::Arc;
8
9/// Create the `unicode` module.
10pub fn create_unicode_module() -> ModuleExports {
11    let mut module = ModuleExports::new("unicode");
12    module.description = "Unicode text processing utilities".to_string();
13
14    // unicode.normalize(text: string, form: string) -> string
15    module.add_function_with_schema(
16        "normalize",
17        |args: &[ValueWord], _ctx: &ModuleContext| {
18            use unicode_normalization::UnicodeNormalization;
19
20            let text = args
21                .first()
22                .and_then(|a| a.as_str())
23                .ok_or_else(|| "unicode.normalize() requires a string argument".to_string())?;
24
25            let form = args
26                .get(1)
27                .and_then(|a| a.as_str())
28                .ok_or_else(|| {
29                    "unicode.normalize() requires a form argument (\"NFC\", \"NFD\", \"NFKC\", or \"NFKD\")"
30                        .to_string()
31                })?;
32
33            let normalized: String = match form {
34                "NFC" => text.nfc().collect(),
35                "NFD" => text.nfd().collect(),
36                "NFKC" => text.nfkc().collect(),
37                "NFKD" => text.nfkd().collect(),
38                _ => {
39                    return Err(format!(
40                        "unicode.normalize(): unknown form '{}', expected NFC/NFD/NFKC/NFKD",
41                        form
42                    ));
43                }
44            };
45
46            Ok(ValueWord::from_string(Arc::new(normalized)))
47        },
48        ModuleFunction {
49            description: "Normalize a Unicode string to the specified form".to_string(),
50            params: vec![
51                ModuleParam {
52                    name: "text".to_string(),
53                    type_name: "string".to_string(),
54                    required: true,
55                    description: "Text to normalize".to_string(),
56                    ..Default::default()
57                },
58                ModuleParam {
59                    name: "form".to_string(),
60                    type_name: "string".to_string(),
61                    required: true,
62                    description: "Normalization form: NFC, NFD, NFKC, or NFKD".to_string(),
63                    allowed_values: Some(vec![
64                        "NFC".to_string(),
65                        "NFD".to_string(),
66                        "NFKC".to_string(),
67                        "NFKD".to_string(),
68                    ]),
69                    ..Default::default()
70                },
71            ],
72            return_type: Some("string".to_string()),
73        },
74    );
75
76    // unicode.category(codepoint: int) -> string
77    module.add_function_with_schema(
78        "category",
79        |args: &[ValueWord], _ctx: &ModuleContext| {
80            let cp = args
81                .first()
82                .and_then(|a| a.as_i64().or_else(|| a.as_f64().map(|n| n as i64)))
83                .ok_or_else(|| {
84                    "unicode.category() requires an int argument (codepoint)".to_string()
85                })?;
86
87            let ch = char::from_u32(cp as u32)
88                .ok_or_else(|| format!("unicode.category(): invalid codepoint {}", cp))?;
89
90            let category = unicode_general_category(ch);
91            Ok(ValueWord::from_string(Arc::new(category.to_string())))
92        },
93        ModuleFunction {
94            description: "Get the Unicode general category of a codepoint".to_string(),
95            params: vec![ModuleParam {
96                name: "codepoint".to_string(),
97                type_name: "int".to_string(),
98                required: true,
99                description: "Unicode codepoint (e.g., 65 for 'A')".to_string(),
100                ..Default::default()
101            }],
102            return_type: Some("string".to_string()),
103        },
104    );
105
106    // unicode.is_letter(char: string) -> bool
107    module.add_function_with_schema(
108        "is_letter",
109        |args: &[ValueWord], _ctx: &ModuleContext| {
110            let s = args
111                .first()
112                .and_then(|a| a.as_str())
113                .ok_or_else(|| "unicode.is_letter() requires a string argument".to_string())?;
114
115            let result = s.chars().next().map_or(false, |c| c.is_alphabetic());
116            Ok(ValueWord::from_bool(result))
117        },
118        ModuleFunction {
119            description: "Check if the first character is a Unicode letter".to_string(),
120            params: vec![ModuleParam {
121                name: "char".to_string(),
122                type_name: "string".to_string(),
123                required: true,
124                description: "Single character string to check".to_string(),
125                ..Default::default()
126            }],
127            return_type: Some("bool".to_string()),
128        },
129    );
130
131    // unicode.is_digit(char: string) -> bool
132    module.add_function_with_schema(
133        "is_digit",
134        |args: &[ValueWord], _ctx: &ModuleContext| {
135            let s = args
136                .first()
137                .and_then(|a| a.as_str())
138                .ok_or_else(|| "unicode.is_digit() requires a string argument".to_string())?;
139
140            let result = s.chars().next().map_or(false, |c| c.is_numeric());
141            Ok(ValueWord::from_bool(result))
142        },
143        ModuleFunction {
144            description: "Check if the first character is a Unicode digit".to_string(),
145            params: vec![ModuleParam {
146                name: "char".to_string(),
147                type_name: "string".to_string(),
148                required: true,
149                description: "Single character string to check".to_string(),
150                ..Default::default()
151            }],
152            return_type: Some("bool".to_string()),
153        },
154    );
155
156    // unicode.graphemes(text: string) -> Array<string>
157    module.add_function_with_schema(
158        "graphemes",
159        |args: &[ValueWord], _ctx: &ModuleContext| {
160            use unicode_segmentation::UnicodeSegmentation;
161
162            let text = args
163                .first()
164                .and_then(|a| a.as_str())
165                .ok_or_else(|| "unicode.graphemes() requires a string argument".to_string())?;
166
167            let clusters: Vec<ValueWord> = text
168                .graphemes(true)
169                .map(|g| ValueWord::from_string(Arc::new(g.to_string())))
170                .collect();
171
172            Ok(ValueWord::from_array(Arc::new(clusters)))
173        },
174        ModuleFunction {
175            description: "Split a string into Unicode grapheme clusters".to_string(),
176            params: vec![ModuleParam {
177                name: "text".to_string(),
178                type_name: "string".to_string(),
179                required: true,
180                description: "Text to split into grapheme clusters".to_string(),
181                ..Default::default()
182            }],
183            return_type: Some("Array<string>".to_string()),
184        },
185    );
186
187    module
188}
189
/// Approximate the Unicode general category of `ch` using only `std::char`
/// classification.
///
/// Returns a two-letter general-category abbreviation. The result is exact
/// for control characters (`Cc`), whitespace separators (`Zs`/`Zl`/`Zp`),
/// ASCII digits (`Nd`), ASCII punctuation and symbols, and private-use
/// characters (`Co`). Everything else is a best-effort approximation:
///
/// - letters are reported as `Lu`, `Ll` or `Lo` from `is_uppercase`,
///   `is_lowercase` and `is_alphabetic`;
/// - any non-ASCII numeric character is reported as `No`;
/// - characters matching none of the checks (marks, non-ASCII punctuation and
///   symbols, unassigned codepoints, ...) are reported as `Cn`.
fn unicode_general_category(ch: char) -> &'static str {
    if ch.is_control() {
        // Must precede the whitespace check: '\t', '\n', '\r' and U+0085 are
        // whitespace, but their general category is Cc, not Zs.
        "Cc"
    } else if ch.is_uppercase() {
        "Lu"
    } else if ch.is_lowercase() {
        "Ll"
    } else if ch.is_alphabetic() {
        "Lo"
    } else if ch.is_ascii_digit() {
        "Nd"
    } else if ch.is_numeric() {
        "No"
    } else if ch.is_whitespace() {
        // With controls already handled, the remaining whitespace characters
        // are space separators except for the two dedicated separators below.
        match ch {
            '\u{2028}' => "Zl",
            '\u{2029}' => "Zp",
            _ => "Zs",
        }
    } else if ch.is_ascii_punctuation() {
        // `is_ascii_punctuation` also covers ASCII symbols, so split it into
        // the actual categories instead of reporting everything as Po.
        match ch {
            '(' | '[' | '{' => "Ps",
            ')' | ']' | '}' => "Pe",
            '-' => "Pd",
            '_' => "Pc",
            '+' | '<' | '=' | '>' | '|' | '~' => "Sm",
            '$' => "Sc",
            '^' | '`' => "Sk",
            _ => "Po",
        }
    } else if matches!(
        ch,
        '\u{E000}'..='\u{F8FF}' | '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}'
    ) {
        // Private-use areas (BMP block plus planes 15 and 16).
        "Co"
    } else {
        "Cn"
    }
}
212
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `ModuleContext` with every optional hook unset.
    ///
    /// The schema registry is leaked to obtain the `'static` reference the
    /// context requires.
    fn test_ctx() -> crate::module_exports::ModuleContext<'static> {
        let registry = Box::leak(Box::new(crate::type_schema::TypeSchemaRegistry::new()));
        crate::module_exports::ModuleContext {
            schemas: registry,
            invoke_callable: None,
            raw_invoker: None,
            function_hashes: None,
            vm_state: None,
            granted_permissions: None,
            scope_constraints: None,
            set_pending_resume: None,
            set_pending_frame_resume: None,
        }
    }

    /// Wrap a string slice as a string `ValueWord` argument.
    fn string_arg(s: &str) -> ValueWord {
        ValueWord::from_string(Arc::new(s.to_string()))
    }

    /// Call the export `name` with `args` and unwrap the successful result.
    fn call_ok(name: &str, args: &[ValueWord]) -> ValueWord {
        let module = create_unicode_module();
        let f = module.get_export(name).unwrap();
        let ctx = test_ctx();
        f(args, &ctx).unwrap()
    }

    #[test]
    fn test_unicode_module_creation() {
        let module = create_unicode_module();
        assert_eq!(module.name, "unicode");
        for export in ["normalize", "category", "is_letter", "is_digit", "graphemes"] {
            assert!(module.has_export(export), "missing export {}", export);
        }
    }

    #[test]
    fn test_normalize_nfc() {
        // "e" followed by a combining acute accent composes to U+00E9.
        let result = call_ok("normalize", &[string_arg("e\u{0301}"), string_arg("NFC")]);
        assert_eq!(result.as_str(), Some("\u{00e9}"));
    }

    #[test]
    fn test_normalize_nfd() {
        // U+00E9 decomposes to "e" followed by a combining acute accent.
        let result = call_ok("normalize", &[string_arg("\u{00e9}"), string_arg("NFD")]);
        assert_eq!(result.as_str(), Some("e\u{0301}"));
    }

    #[test]
    fn test_normalize_invalid_form() {
        let module = create_unicode_module();
        let f = module.get_export("normalize").unwrap();
        let ctx = test_ctx();
        assert!(f(&[string_arg("hello"), string_arg("INVALID")], &ctx).is_err());
    }

    #[test]
    fn test_category_uppercase() {
        let result = call_ok("category", &[ValueWord::from_i64(65)]); // 'A'
        assert_eq!(result.as_str(), Some("Lu"));
    }

    #[test]
    fn test_category_lowercase() {
        let result = call_ok("category", &[ValueWord::from_i64(97)]); // 'a'
        assert_eq!(result.as_str(), Some("Ll"));
    }

    #[test]
    fn test_category_digit() {
        let result = call_ok("category", &[ValueWord::from_i64(48)]); // '0'
        assert_eq!(result.as_str(), Some("Nd"));
    }

    #[test]
    fn test_category_invalid_codepoint() {
        let module = create_unicode_module();
        let f = module.get_export("category").unwrap();
        let ctx = test_ctx();
        // U+D800 is a surrogate, which is not a Unicode scalar value.
        assert!(f(&[ValueWord::from_i64(0xD800)], &ctx).is_err());
    }

    #[test]
    fn test_is_letter_alpha() {
        let result = call_ok("is_letter", &[string_arg("\u{00e9}")]);
        assert_eq!(result.as_bool(), Some(true));
    }

    #[test]
    fn test_is_letter_digit() {
        let result = call_ok("is_letter", &[string_arg("5")]);
        assert_eq!(result.as_bool(), Some(false));
    }

    #[test]
    fn test_is_digit_numeric() {
        let result = call_ok("is_digit", &[string_arg("7")]);
        assert_eq!(result.as_bool(), Some(true));
    }

    #[test]
    fn test_is_digit_alpha() {
        let result = call_ok("is_digit", &[string_arg("a")]);
        assert_eq!(result.as_bool(), Some(false));
    }

    #[test]
    fn test_graphemes_ascii() {
        // Plain ASCII: one grapheme cluster per character.
        let result = call_ok("graphemes", &[string_arg("hello")]);
        let arr = result.as_any_array().unwrap().to_generic();
        assert_eq!(arr.len(), 5);
        assert_eq!(arr[0].as_str(), Some("h"));
        assert_eq!(arr[4].as_str(), Some("o"));
    }

    #[test]
    fn test_graphemes_emoji() {
        // Family emoji: man + ZWJ + woman + ZWJ + girl is five codepoints but
        // a single extended grapheme cluster.
        let family = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}";
        let result = call_ok("graphemes", &[string_arg(family)]);
        let arr = result.as_any_array().unwrap().to_generic();
        assert_eq!(arr.len(), 1);
        assert_eq!(arr[0].as_str(), Some(family));
    }

    #[test]
    fn test_graphemes_combining() {
        // "e" + combining acute = one grapheme cluster, followed by "a".
        let result = call_ok("graphemes", &[string_arg("e\u{0301}a")]);
        let arr = result.as_any_array().unwrap().to_generic();
        assert_eq!(arr.len(), 2); // "e\u{0301}" and "a"
    }
}
364        let result = f(&[input], &ctx).unwrap();
365        let arr = result.as_any_array().unwrap().to_generic();
366        assert_eq!(arr.len(), 2); // "e\u{0301}" and "a"
367    }
368}