1use crate::module_exports::{ModuleContext, ModuleExports, ModuleFunction, ModuleParam};
6use shape_value::ValueWord;
7use std::sync::Arc;
8
9pub fn create_unicode_module() -> ModuleExports {
11 let mut module = ModuleExports::new("std::core::unicode");
12 module.description = "Unicode text processing utilities".to_string();
13
14 module.add_function_with_schema(
16 "normalize",
17 |args: &[ValueWord], _ctx: &ModuleContext| {
18 use unicode_normalization::UnicodeNormalization;
19
20 let text = args
21 .first()
22 .and_then(|a| a.as_str())
23 .ok_or_else(|| "unicode.normalize() requires a string argument".to_string())?;
24
25 let form = args
26 .get(1)
27 .and_then(|a| a.as_str())
28 .ok_or_else(|| {
29 "unicode.normalize() requires a form argument (\"NFC\", \"NFD\", \"NFKC\", or \"NFKD\")"
30 .to_string()
31 })?;
32
33 let normalized: String = match form {
34 "NFC" => text.nfc().collect(),
35 "NFD" => text.nfd().collect(),
36 "NFKC" => text.nfkc().collect(),
37 "NFKD" => text.nfkd().collect(),
38 _ => {
39 return Err(format!(
40 "unicode.normalize(): unknown form '{}', expected NFC/NFD/NFKC/NFKD",
41 form
42 ));
43 }
44 };
45
46 Ok(ValueWord::from_string(Arc::new(normalized)))
47 },
48 ModuleFunction {
49 description: "Normalize a Unicode string to the specified form".to_string(),
50 params: vec![
51 ModuleParam {
52 name: "text".to_string(),
53 type_name: "string".to_string(),
54 required: true,
55 description: "Text to normalize".to_string(),
56 ..Default::default()
57 },
58 ModuleParam {
59 name: "form".to_string(),
60 type_name: "string".to_string(),
61 required: true,
62 description: "Normalization form: NFC, NFD, NFKC, or NFKD".to_string(),
63 allowed_values: Some(vec![
64 "NFC".to_string(),
65 "NFD".to_string(),
66 "NFKC".to_string(),
67 "NFKD".to_string(),
68 ]),
69 ..Default::default()
70 },
71 ],
72 return_type: Some("string".to_string()),
73 },
74 );
75
76 module.add_function_with_schema(
78 "category",
79 |args: &[ValueWord], _ctx: &ModuleContext| {
80 let cp = args
81 .first()
82 .and_then(|a| a.as_i64().or_else(|| a.as_f64().map(|n| n as i64)))
83 .ok_or_else(|| {
84 "unicode.category() requires an int argument (codepoint)".to_string()
85 })?;
86
87 let ch = char::from_u32(cp as u32)
88 .ok_or_else(|| format!("unicode.category(): invalid codepoint {}", cp))?;
89
90 let category = unicode_general_category(ch);
91 Ok(ValueWord::from_string(Arc::new(category.to_string())))
92 },
93 ModuleFunction {
94 description: "Get the Unicode general category of a codepoint".to_string(),
95 params: vec![ModuleParam {
96 name: "codepoint".to_string(),
97 type_name: "int".to_string(),
98 required: true,
99 description: "Unicode codepoint (e.g., 65 for 'A')".to_string(),
100 ..Default::default()
101 }],
102 return_type: Some("string".to_string()),
103 },
104 );
105
106 module.add_function_with_schema(
108 "is_letter",
109 |args: &[ValueWord], _ctx: &ModuleContext| {
110 let s = args
111 .first()
112 .and_then(|a| a.as_str())
113 .ok_or_else(|| "unicode.is_letter() requires a string argument".to_string())?;
114
115 let result = s.chars().next().map_or(false, |c| c.is_alphabetic());
116 Ok(ValueWord::from_bool(result))
117 },
118 ModuleFunction {
119 description: "Check if the first character is a Unicode letter".to_string(),
120 params: vec![ModuleParam {
121 name: "char".to_string(),
122 type_name: "string".to_string(),
123 required: true,
124 description: "Single character string to check".to_string(),
125 ..Default::default()
126 }],
127 return_type: Some("bool".to_string()),
128 },
129 );
130
131 module.add_function_with_schema(
133 "is_digit",
134 |args: &[ValueWord], _ctx: &ModuleContext| {
135 let s = args
136 .first()
137 .and_then(|a| a.as_str())
138 .ok_or_else(|| "unicode.is_digit() requires a string argument".to_string())?;
139
140 let result = s.chars().next().map_or(false, |c| c.is_numeric());
141 Ok(ValueWord::from_bool(result))
142 },
143 ModuleFunction {
144 description: "Check if the first character is a Unicode digit".to_string(),
145 params: vec![ModuleParam {
146 name: "char".to_string(),
147 type_name: "string".to_string(),
148 required: true,
149 description: "Single character string to check".to_string(),
150 ..Default::default()
151 }],
152 return_type: Some("bool".to_string()),
153 },
154 );
155
156 module.add_function_with_schema(
158 "graphemes",
159 |args: &[ValueWord], _ctx: &ModuleContext| {
160 use unicode_segmentation::UnicodeSegmentation;
161
162 let text = args
163 .first()
164 .and_then(|a| a.as_str())
165 .ok_or_else(|| "unicode.graphemes() requires a string argument".to_string())?;
166
167 let clusters: Vec<ValueWord> = text
168 .graphemes(true)
169 .map(|g| ValueWord::from_string(Arc::new(g.to_string())))
170 .collect();
171
172 Ok(ValueWord::from_array(Arc::new(clusters)))
173 },
174 ModuleFunction {
175 description: "Split a string into Unicode grapheme clusters".to_string(),
176 params: vec![ModuleParam {
177 name: "text".to_string(),
178 type_name: "string".to_string(),
179 required: true,
180 description: "Text to split into grapheme clusters".to_string(),
181 ..Default::default()
182 }],
183 return_type: Some("Array<string>".to_string()),
184 },
185 );
186
187 module
188}
189
/// Approximate the Unicode general category of `ch` using only the `char`
/// predicates available in the standard library.
///
/// Returns the two-letter category abbreviation. The result is exact for
/// control characters, whitespace/separators and ASCII punctuation; the
/// remaining branches are heuristics:
/// - cased characters are reported as `"Lu"` / `"Ll"`, other alphabetic
///   characters as `"Lo"`;
/// - ASCII digits are `"Nd"`, every other numeric character is `"No"`;
/// - anything not recognised by a predicate falls through to `"Cn"`.
fn unicode_general_category(ch: char) -> &'static str {
    if ch.is_uppercase() {
        "Lu"
    } else if ch.is_lowercase() {
        "Ll"
    } else if ch.is_alphabetic() {
        "Lo"
    } else if ch.is_ascii_digit() {
        "Nd"
    } else if ch.is_numeric() {
        "No"
    } else if ch.is_control() {
        // Must be tested before whitespace: TAB, LF, VT, FF, CR and NEL are
        // whitespace *and* control characters, and their category is Cc.
        "Cc"
    } else if ch.is_whitespace() {
        // With the controls already handled, the remaining whitespace
        // characters are the line/paragraph separators and the Zs spaces.
        match ch {
            '\u{2028}' => "Zl",
            '\u{2029}' => "Zp",
            _ => "Zs",
        }
    } else if ch.is_ascii_punctuation() {
        // ASCII "punctuation" spans several general categories.
        match ch {
            '(' | '[' | '{' => "Ps",
            ')' | ']' | '}' => "Pe",
            '-' => "Pd",
            '_' => "Pc",
            '$' => "Sc",
            '+' | '<' | '=' | '>' | '|' | '~' => "Sm",
            '^' | '`' => "Sk",
            _ => "Po",
        }
    } else {
        "Cn"
    }
}
212
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `ModuleContext` with every optional hook unset. The schema
    /// registry is leaked so the context can carry a `'static` lifetime.
    fn test_ctx() -> crate::module_exports::ModuleContext<'static> {
        let leaked_registry = Box::leak(Box::new(crate::type_schema::TypeSchemaRegistry::new()));
        crate::module_exports::ModuleContext {
            schemas: leaked_registry,
            invoke_callable: None,
            raw_invoker: None,
            function_hashes: None,
            vm_state: None,
            granted_permissions: None,
            scope_constraints: None,
            set_pending_resume: None,
            set_pending_frame_resume: None,
        }
    }

    /// Wrap a string slice in a string-typed `ValueWord`.
    fn string_value(s: &str) -> ValueWord {
        ValueWord::from_string(Arc::new(s.to_string()))
    }

    /// Create a fresh module, look up the export `name`, call it with `args`
    /// and unwrap the successful result.
    fn call_export(name: &str, args: &[ValueWord]) -> ValueWord {
        let module = create_unicode_module();
        let export = module.get_export(name).unwrap();
        let ctx = test_ctx();
        export(args, &ctx).unwrap()
    }

    #[test]
    fn test_unicode_module_creation() {
        let module = create_unicode_module();
        assert_eq!(module.name, "std::core::unicode");
        for export in ["normalize", "category", "is_letter", "is_digit", "graphemes"] {
            assert!(module.has_export(export));
        }
    }

    #[test]
    fn test_normalize_nfc() {
        // 'e' followed by a combining acute accent composes to a single U+00E9.
        let composed = call_export(
            "normalize",
            &[string_value("e\u{0301}"), string_value("NFC")],
        );
        assert_eq!(composed.as_str(), Some("\u{00e9}"));
    }

    #[test]
    fn test_normalize_nfd() {
        // U+00E9 decomposes to 'e' followed by a combining acute accent.
        let decomposed = call_export(
            "normalize",
            &[string_value("\u{00e9}"), string_value("NFD")],
        );
        assert_eq!(decomposed.as_str(), Some("e\u{0301}"));
    }

    #[test]
    fn test_normalize_invalid_form() {
        let module = create_unicode_module();
        let normalize = module.get_export("normalize").unwrap();
        let ctx = test_ctx();
        let args = [string_value("hello"), string_value("INVALID")];
        assert!(normalize(&args, &ctx).is_err());
    }

    #[test]
    fn test_category_uppercase() {
        // 65 is 'A'.
        let category = call_export("category", &[ValueWord::from_i64(65)]);
        assert_eq!(category.as_str(), Some("Lu"));
    }

    #[test]
    fn test_category_lowercase() {
        // 97 is 'a'.
        let category = call_export("category", &[ValueWord::from_i64(97)]);
        assert_eq!(category.as_str(), Some("Ll"));
    }

    #[test]
    fn test_category_digit() {
        // 48 is '0'.
        let category = call_export("category", &[ValueWord::from_i64(48)]);
        assert_eq!(category.as_str(), Some("Nd"));
    }

    #[test]
    fn test_is_letter_alpha() {
        let verdict = call_export("is_letter", &[string_value("\u{00e9}")]);
        assert_eq!(verdict.as_bool(), Some(true));
    }

    #[test]
    fn test_is_letter_digit() {
        let verdict = call_export("is_letter", &[string_value("5")]);
        assert_eq!(verdict.as_bool(), Some(false));
    }

    #[test]
    fn test_is_digit_numeric() {
        let verdict = call_export("is_digit", &[string_value("7")]);
        assert_eq!(verdict.as_bool(), Some(true));
    }

    #[test]
    fn test_is_digit_alpha() {
        let verdict = call_export("is_digit", &[string_value("a")]);
        assert_eq!(verdict.as_bool(), Some(false));
    }

    #[test]
    fn test_graphemes_emoji() {
        // Plain ASCII input: one cluster per character.
        let result = call_export("graphemes", &[string_value("hello")]);
        let clusters = result.as_any_array().unwrap().to_generic();
        assert_eq!(clusters.len(), 5);
        assert_eq!(clusters[0].as_str(), Some("h"));
        assert_eq!(clusters[4].as_str(), Some("o"));
    }

    #[test]
    fn test_graphemes_combining() {
        // 'e' plus its combining accent forms one cluster, 'a' the second.
        let result = call_export("graphemes", &[string_value("e\u{0301}a")]);
        let clusters = result.as_any_array().unwrap().to_generic();
        assert_eq!(clusters.len(), 2);
    }
}