Skip to main content

shape_runtime/stdlib/
regex.rs

1//! Native `regex` module for regular expression operations.
2//!
3//! Exports: regex.match, regex.match_all, regex.replace, regex.replace_all,
4//!          regex.is_match, regex.split
5
6use crate::module_exports::{ModuleContext, ModuleExports, ModuleFunction, ModuleParam};
7use shape_value::ValueWord;
8use std::sync::Arc;
9
10/// Build a match result object as a ValueWord HashMap.
11/// Fields: text (string), start (number), end (number), groups (array of strings).
12fn match_to_nanboxed(m: &regex::Match, captures: &regex::Captures) -> ValueWord {
13    let mut keys = Vec::with_capacity(4);
14    let mut values = Vec::with_capacity(4);
15
16    keys.push(ValueWord::from_string(Arc::new("text".to_string())));
17    values.push(ValueWord::from_string(Arc::new(m.as_str().to_string())));
18
19    keys.push(ValueWord::from_string(Arc::new("start".to_string())));
20    values.push(ValueWord::from_f64(m.start() as f64));
21
22    keys.push(ValueWord::from_string(Arc::new("end".to_string())));
23    values.push(ValueWord::from_f64(m.end() as f64));
24
25    let groups: Vec<ValueWord> = captures
26        .iter()
27        .skip(1)
28        .map(|opt| match opt {
29            Some(g) => ValueWord::from_string(Arc::new(g.as_str().to_string())),
30            None => ValueWord::none(),
31        })
32        .collect();
33    keys.push(ValueWord::from_string(Arc::new("groups".to_string())));
34    values.push(ValueWord::from_array(Arc::new(groups)));
35
36    ValueWord::from_hashmap_pairs(keys, values)
37}
38
39/// Create the `regex` module with regular expression functions.
40pub fn create_regex_module() -> ModuleExports {
41    let mut module = ModuleExports::new("regex");
42    module.description = "Regular expression matching and replacement".to_string();
43
44    // regex.is_match(text: string, pattern: string) -> bool
45    module.add_function_with_schema(
46        "is_match",
47        |args: &[ValueWord], _ctx: &ModuleContext| {
48            let text = args
49                .first()
50                .and_then(|a| a.as_str())
51                .ok_or_else(|| "regex.is_match() requires a text string argument".to_string())?;
52
53            let pattern = args
54                .get(1)
55                .and_then(|a| a.as_str())
56                .ok_or_else(|| "regex.is_match() requires a pattern string argument".to_string())?;
57
58            let re = regex::Regex::new(pattern)
59                .map_err(|e| format!("regex.is_match() invalid pattern: {}", e))?;
60
61            Ok(ValueWord::from_bool(re.is_match(text)))
62        },
63        ModuleFunction {
64            description: "Test whether the pattern matches anywhere in the text".to_string(),
65            params: vec![
66                ModuleParam {
67                    name: "text".to_string(),
68                    type_name: "string".to_string(),
69                    required: true,
70                    description: "Text to search".to_string(),
71                    ..Default::default()
72                },
73                ModuleParam {
74                    name: "pattern".to_string(),
75                    type_name: "string".to_string(),
76                    required: true,
77                    description: "Regular expression pattern".to_string(),
78                    ..Default::default()
79                },
80            ],
81            return_type: Some("bool".to_string()),
82        },
83    );
84
85    // regex.match(text: string, pattern: string) -> Option<object>
86    module.add_function_with_schema(
87        "match",
88        |args: &[ValueWord], _ctx: &ModuleContext| {
89            let text = args
90                .first()
91                .and_then(|a| a.as_str())
92                .ok_or_else(|| "regex.match() requires a text string argument".to_string())?;
93
94            let pattern = args
95                .get(1)
96                .and_then(|a| a.as_str())
97                .ok_or_else(|| "regex.match() requires a pattern string argument".to_string())?;
98
99            let re = regex::Regex::new(pattern)
100                .map_err(|e| format!("regex.match() invalid pattern: {}", e))?;
101
102            match re.captures(text) {
103                Some(caps) => {
104                    let m = caps.get(0).unwrap();
105                    Ok(ValueWord::from_some(match_to_nanboxed(&m, &caps)))
106                }
107                None => Ok(ValueWord::none()),
108            }
109        },
110        ModuleFunction {
111            description: "Find the first match of the pattern, returning a match object or none"
112                .to_string(),
113            params: vec![
114                ModuleParam {
115                    name: "text".to_string(),
116                    type_name: "string".to_string(),
117                    required: true,
118                    description: "Text to search".to_string(),
119                    ..Default::default()
120                },
121                ModuleParam {
122                    name: "pattern".to_string(),
123                    type_name: "string".to_string(),
124                    required: true,
125                    description: "Regular expression pattern".to_string(),
126                    ..Default::default()
127                },
128            ],
129            return_type: Some("Option<object>".to_string()),
130        },
131    );
132
133    // regex.match_all(text: string, pattern: string) -> Array<object>
134    module.add_function_with_schema(
135        "match_all",
136        |args: &[ValueWord], _ctx: &ModuleContext| {
137            let text = args
138                .first()
139                .and_then(|a| a.as_str())
140                .ok_or_else(|| "regex.match_all() requires a text string argument".to_string())?;
141
142            let pattern = args.get(1).and_then(|a| a.as_str()).ok_or_else(|| {
143                "regex.match_all() requires a pattern string argument".to_string()
144            })?;
145
146            let re = regex::Regex::new(pattern)
147                .map_err(|e| format!("regex.match_all() invalid pattern: {}", e))?;
148
149            let matches: Vec<ValueWord> = re
150                .captures_iter(text)
151                .map(|caps| {
152                    let m = caps.get(0).unwrap();
153                    match_to_nanboxed(&m, &caps)
154                })
155                .collect();
156
157            Ok(ValueWord::from_array(Arc::new(matches)))
158        },
159        ModuleFunction {
160            description: "Find all non-overlapping matches of the pattern".to_string(),
161            params: vec![
162                ModuleParam {
163                    name: "text".to_string(),
164                    type_name: "string".to_string(),
165                    required: true,
166                    description: "Text to search".to_string(),
167                    ..Default::default()
168                },
169                ModuleParam {
170                    name: "pattern".to_string(),
171                    type_name: "string".to_string(),
172                    required: true,
173                    description: "Regular expression pattern".to_string(),
174                    ..Default::default()
175                },
176            ],
177            return_type: Some("Array<object>".to_string()),
178        },
179    );
180
181    // regex.replace(text: string, pattern: string, replacement: string) -> string
182    module.add_function_with_schema(
183        "replace",
184        |args: &[ValueWord], _ctx: &ModuleContext| {
185            let text = args
186                .first()
187                .and_then(|a| a.as_str())
188                .ok_or_else(|| "regex.replace() requires a text string argument".to_string())?;
189
190            let pattern = args
191                .get(1)
192                .and_then(|a| a.as_str())
193                .ok_or_else(|| "regex.replace() requires a pattern string argument".to_string())?;
194
195            let replacement = args.get(2).and_then(|a| a.as_str()).ok_or_else(|| {
196                "regex.replace() requires a replacement string argument".to_string()
197            })?;
198
199            let re = regex::Regex::new(pattern)
200                .map_err(|e| format!("regex.replace() invalid pattern: {}", e))?;
201
202            let result = re.replace(text, replacement);
203            Ok(ValueWord::from_string(Arc::new(result.into_owned())))
204        },
205        ModuleFunction {
206            description: "Replace the first match of the pattern with the replacement".to_string(),
207            params: vec![
208                ModuleParam {
209                    name: "text".to_string(),
210                    type_name: "string".to_string(),
211                    required: true,
212                    description: "Text to search".to_string(),
213                    ..Default::default()
214                },
215                ModuleParam {
216                    name: "pattern".to_string(),
217                    type_name: "string".to_string(),
218                    required: true,
219                    description: "Regular expression pattern".to_string(),
220                    ..Default::default()
221                },
222                ModuleParam {
223                    name: "replacement".to_string(),
224                    type_name: "string".to_string(),
225                    required: true,
226                    description: "Replacement string (supports $1, $2 for capture groups)"
227                        .to_string(),
228                    ..Default::default()
229                },
230            ],
231            return_type: Some("string".to_string()),
232        },
233    );
234
235    // regex.replace_all(text: string, pattern: string, replacement: string) -> string
236    module.add_function_with_schema(
237        "replace_all",
238        |args: &[ValueWord], _ctx: &ModuleContext| {
239            let text = args
240                .first()
241                .and_then(|a| a.as_str())
242                .ok_or_else(|| "regex.replace_all() requires a text string argument".to_string())?;
243
244            let pattern = args.get(1).and_then(|a| a.as_str()).ok_or_else(|| {
245                "regex.replace_all() requires a pattern string argument".to_string()
246            })?;
247
248            let replacement = args.get(2).and_then(|a| a.as_str()).ok_or_else(|| {
249                "regex.replace_all() requires a replacement string argument".to_string()
250            })?;
251
252            let re = regex::Regex::new(pattern)
253                .map_err(|e| format!("regex.replace_all() invalid pattern: {}", e))?;
254
255            let result = re.replace_all(text, replacement);
256            Ok(ValueWord::from_string(Arc::new(result.into_owned())))
257        },
258        ModuleFunction {
259            description: "Replace all matches of the pattern with the replacement".to_string(),
260            params: vec![
261                ModuleParam {
262                    name: "text".to_string(),
263                    type_name: "string".to_string(),
264                    required: true,
265                    description: "Text to search".to_string(),
266                    ..Default::default()
267                },
268                ModuleParam {
269                    name: "pattern".to_string(),
270                    type_name: "string".to_string(),
271                    required: true,
272                    description: "Regular expression pattern".to_string(),
273                    ..Default::default()
274                },
275                ModuleParam {
276                    name: "replacement".to_string(),
277                    type_name: "string".to_string(),
278                    required: true,
279                    description: "Replacement string (supports $1, $2 for capture groups)"
280                        .to_string(),
281                    ..Default::default()
282                },
283            ],
284            return_type: Some("string".to_string()),
285        },
286    );
287
288    // regex.split(text: string, pattern: string) -> Array<string>
289    module.add_function_with_schema(
290        "split",
291        |args: &[ValueWord], _ctx: &ModuleContext| {
292            let text = args
293                .first()
294                .and_then(|a| a.as_str())
295                .ok_or_else(|| "regex.split() requires a text string argument".to_string())?;
296
297            let pattern = args
298                .get(1)
299                .and_then(|a| a.as_str())
300                .ok_or_else(|| "regex.split() requires a pattern string argument".to_string())?;
301
302            let re = regex::Regex::new(pattern)
303                .map_err(|e| format!("regex.split() invalid pattern: {}", e))?;
304
305            let parts: Vec<ValueWord> = re
306                .split(text)
307                .map(|s| ValueWord::from_string(Arc::new(s.to_string())))
308                .collect();
309
310            Ok(ValueWord::from_array(Arc::new(parts)))
311        },
312        ModuleFunction {
313            description: "Split the text at each match of the pattern".to_string(),
314            params: vec![
315                ModuleParam {
316                    name: "text".to_string(),
317                    type_name: "string".to_string(),
318                    required: true,
319                    description: "Text to split".to_string(),
320                    ..Default::default()
321                },
322                ModuleParam {
323                    name: "pattern".to_string(),
324                    type_name: "string".to_string(),
325                    required: true,
326                    description: "Regular expression pattern to split on".to_string(),
327                    ..Default::default()
328                },
329            ],
330            return_type: Some("Array<string>".to_string()),
331        },
332    );
333
334    module
335}
336
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a fresh module and context, look up the named export and call it
    /// with the given arguments, yielding the export's raw `Result`.
    macro_rules! call_export {
        ($name:expr, [$($arg:expr),+ $(,)?]) => {{
            let module = create_regex_module();
            let ctx = test_ctx();
            let export = module.get_export($name).unwrap();
            // Bound to a local so the argument array temporary is released
            // before `module` and `ctx` go out of scope.
            let outcome = export(&[$($arg),+], &ctx);
            outcome
        }};
    }

    /// Shorthand for a string `ValueWord`.
    fn s(val: &str) -> ValueWord {
        ValueWord::from_string(Arc::new(val.to_string()))
    }

    /// Minimal context for calling exports directly: a leaked, empty schema
    /// registry and every optional hook left unset.
    fn test_ctx() -> ModuleContext<'static> {
        let leaked = Box::leak(Box::new(crate::type_schema::TypeSchemaRegistry::new()));
        ModuleContext {
            schemas: leaked,
            invoke_callable: None,
            raw_invoker: None,
            function_hashes: None,
            vm_state: None,
            granted_permissions: None,
            scope_constraints: None,
            set_pending_resume: None,
            set_pending_frame_resume: None,
        }
    }

    #[test]
    fn test_regex_module_creation() {
        let module = create_regex_module();
        assert_eq!(module.name, "regex");
        for export in ["is_match", "match", "match_all", "replace", "replace_all", "split"] {
            assert!(module.has_export(export));
        }
    }

    #[test]
    fn test_is_match_true() {
        let verdict = call_export!("is_match", [s("hello world"), s(r"\bworld\b")]).unwrap();
        assert_eq!(verdict.as_bool(), Some(true));
    }

    #[test]
    fn test_is_match_false() {
        let verdict = call_export!("is_match", [s("hello world"), s(r"^\d+$")]).unwrap();
        assert_eq!(verdict.as_bool(), Some(false));
    }

    #[test]
    fn test_is_match_invalid_pattern() {
        let outcome = call_export!("is_match", [s("text"), s("[invalid")]);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_match_found() {
        let found = call_export!("match", [s("abc 123 def"), s(r"(\d+)")]).unwrap();
        // The export wraps the match object in Some(..).
        let match_obj = found.as_some_inner().expect("should be Some");
        let (keys, values, _) = match_obj.as_hashmap().expect("should be hashmap");
        // Locate the "text" entry and check the matched substring.
        let text_pos = keys
            .iter()
            .position(|key| key.as_str() == Some("text"))
            .unwrap();
        assert_eq!(values[text_pos].as_str(), Some("123"));
    }

    #[test]
    fn test_match_not_found() {
        let missing = call_export!("match", [s("abc def"), s(r"\d+")]).unwrap();
        assert!(missing.is_none());
    }

    #[test]
    fn test_match_all() {
        let all = call_export!("match_all", [s("a1 b2 c3"), s(r"\d")]).unwrap();
        let items = all.as_any_array().expect("should be array").to_generic();
        assert_eq!(items.len(), 3);
    }

    #[test]
    fn test_match_all_no_matches() {
        let all = call_export!("match_all", [s("abc"), s(r"\d+")]).unwrap();
        let items = all.as_any_array().expect("should be array").to_generic();
        assert_eq!(items.len(), 0);
    }

    #[test]
    fn test_replace_first() {
        let replaced = call_export!("replace", [s("foo bar foo"), s("foo"), s("baz")]).unwrap();
        assert_eq!(replaced.as_str(), Some("baz bar foo"));
    }

    #[test]
    fn test_replace_all() {
        let replaced =
            call_export!("replace_all", [s("foo bar foo"), s("foo"), s("baz")]).unwrap();
        assert_eq!(replaced.as_str(), Some("baz bar baz"));
    }

    #[test]
    fn test_replace_with_capture_group() {
        let reordered = call_export!(
            "replace_all",
            [
                s("2024-01-15"),
                s(r"(\d{4})-(\d{2})-(\d{2})"),
                s("$3/$2/$1"),
            ]
        )
        .unwrap();
        assert_eq!(reordered.as_str(), Some("15/01/2024"));
    }

    #[test]
    fn test_split() {
        let pieces = call_export!("split", [s("one,two,,three"), s(",")]).unwrap();
        let items = pieces.as_any_array().expect("should be array").to_generic();
        let expected = ["one", "two", "", "three"];
        assert_eq!(items.len(), 4);
        for (idx, want) in expected.iter().enumerate() {
            assert_eq!(items[idx].as_str(), Some(*want));
        }
    }

    #[test]
    fn test_split_by_whitespace() {
        let pieces = call_export!("split", [s("hello   world  test"), s(r"\s+")]).unwrap();
        let items = pieces.as_any_array().expect("should be array").to_generic();
        let expected = ["hello", "world", "test"];
        assert_eq!(items.len(), 3);
        for (idx, want) in expected.iter().enumerate() {
            assert_eq!(items[idx].as_str(), Some(*want));
        }
    }

    #[test]
    fn test_regex_schemas() {
        let module = create_regex_module();

        let for_match = module.get_schema("match").unwrap();
        assert_eq!(for_match.params.len(), 2);
        assert_eq!(for_match.return_type.as_deref(), Some("Option<object>"));

        let for_replace = module.get_schema("replace").unwrap();
        assert_eq!(for_replace.params.len(), 3);

        let for_split = module.get_schema("split").unwrap();
        assert_eq!(for_split.return_type.as_deref(), Some("Array<string>"));
    }
}