Skip to main content

xsd_schema/xpath/functions/
regex.rs

//! XPath 2.0 regex functions.
//!
//! This module implements:
//! - fn:matches($input, $pattern, $flags?) - test whether a string matches a pattern
//! - fn:replace($input, $pattern, $replacement, $flags?) - replace matches
//! - fn:tokenize($input, $pattern, $flags?) - split a string by a pattern
//!
//! Uses the `regexml` crate for native XML Schema 1.1 regex with full Unicode support.
9
10use regexml::Regex;
11
12use crate::xpath::context::DynamicContext;
13use crate::xpath::error::XPathError;
14use crate::xpath::DomNavigator;
15
16use super::{atomize_to_string, atomize_to_string_opt, atomize_to_string_required, XPathValue};
17use crate::types::value::XmlValue;
18use crate::xpath::iterator::XmlItem;
19
20/// fn:matches($input as xs:string?, $pattern as xs:string, $flags as xs:string?) as xs:boolean
21///
22/// Returns true if $input matches the regular expression $pattern.
23///
24/// - If $input is empty, it is treated as empty string.
25/// - FORX0001 if $flags contains invalid characters.
26/// - FORX0002 if $pattern is not a valid regular expression.
27pub fn matches<N: DomNavigator>(
28    _context: &mut DynamicContext<'_, N>,
29    mut args: Vec<XPathValue<N>>,
30) -> Result<XPathValue<N>, XPathError> {
31    if args.len() < 2 || args.len() > 3 {
32        return Err(XPathError::wrong_number_of_arguments(
33            "matches",
34            2,
35            args.len(),
36        ));
37    }
38
39    // Get flags (optional third argument)
40    let flags = if args.len() == 3 {
41        atomize_to_string_opt(args.pop().unwrap())?
42    } else {
43        None
44    };
45
46    // Get pattern (second argument)
47    let pattern = atomize_to_string_required(args.pop().unwrap())?;
48
49    // Get input (first argument)
50    let input = atomize_to_string(args.pop().unwrap())?;
51
52    let flags_str = flags.as_deref().unwrap_or("");
53
54    // Build the regex
55    let regex = build_regex(&pattern, flags_str)?;
56
57    let result = regex.is_match(&input);
58
59    Ok(XPathValue::boolean(result))
60}
61
62/// fn:replace($input as xs:string?, $pattern as xs:string, $replacement as xs:string,
63///            $flags as xs:string?) as xs:string
64///
65/// Replaces all occurrences of $pattern in $input with $replacement.
66///
67/// - FORX0001 if $flags contains invalid characters.
68/// - FORX0002 if $pattern is not a valid regular expression.
69/// - FORX0003 if $pattern matches a zero-length string.
70/// - FORX0004 if $replacement has invalid syntax.
71pub fn replace<N: DomNavigator>(
72    _context: &mut DynamicContext<'_, N>,
73    mut args: Vec<XPathValue<N>>,
74) -> Result<XPathValue<N>, XPathError> {
75    if args.len() < 3 || args.len() > 4 {
76        return Err(XPathError::wrong_number_of_arguments(
77            "replace",
78            3,
79            args.len(),
80        ));
81    }
82
83    // Get flags (optional fourth argument)
84    let flags = if args.len() == 4 {
85        atomize_to_string_opt(args.pop().unwrap())?
86    } else {
87        None
88    };
89
90    // Get replacement (third argument)
91    let replacement = atomize_to_string_required(args.pop().unwrap())?;
92
93    // Get pattern (second argument)
94    let pattern = atomize_to_string_required(args.pop().unwrap())?;
95
96    // Get input (first argument)
97    let input = atomize_to_string(args.pop().unwrap())?;
98
99    // Build the regex
100    let regex = build_regex(&pattern, flags.as_deref().unwrap_or(""))?;
101
102    // regexml handles FORX0003 (zero-length match) and FORX0004 (invalid replacement) internally
103    let result = regex
104        .replace_all(&input, &replacement)
105        .map_err(|e| match e {
106            regexml::Error::MatchesEmptyString => XPathError::regex_matches_zero_length(&pattern),
107            regexml::Error::InvalidReplacementString(_) => {
108                XPathError::invalid_replacement_string(&replacement)
109            }
110            _ => XPathError::invalid_regex_pattern(&pattern),
111        })?;
112
113    Ok(XPathValue::string(result))
114}
115
116/// fn:tokenize($input as xs:string?, $pattern as xs:string, $flags as xs:string?) as xs:string*
117///
118/// Splits $input into a sequence of strings using $pattern as delimiter.
119///
120/// - FORX0001 if $flags contains invalid characters.
121/// - FORX0002 if $pattern is not a valid regular expression.
122/// - FORX0003 if $pattern matches a zero-length string.
123pub fn tokenize<N: DomNavigator>(
124    _context: &mut DynamicContext<'_, N>,
125    mut args: Vec<XPathValue<N>>,
126) -> Result<XPathValue<N>, XPathError> {
127    if args.len() < 2 || args.len() > 3 {
128        return Err(XPathError::wrong_number_of_arguments(
129            "tokenize",
130            2,
131            args.len(),
132        ));
133    }
134
135    // Get flags (optional third argument)
136    let flags = if args.len() == 3 {
137        atomize_to_string_opt(args.pop().unwrap())?
138    } else {
139        None
140    };
141
142    // Get pattern (second argument)
143    let pattern = atomize_to_string_required(args.pop().unwrap())?;
144
145    // Get input (first argument)
146    let input = atomize_to_string(args.pop().unwrap())?;
147
148    // If input is empty, return empty sequence
149    if input.is_empty() {
150        return Ok(XPathValue::Empty);
151    }
152
153    // Build the regex
154    let regex = build_regex(&pattern, flags.as_deref().unwrap_or(""))?;
155
156    // regexml handles FORX0003 (zero-length match) internally
157    let token_iter = regex.tokenize(&input).map_err(|e| match e {
158        regexml::Error::MatchesEmptyString => XPathError::regex_matches_zero_length(&pattern),
159        _ => XPathError::invalid_regex_pattern(&pattern),
160    })?;
161
162    // Convert to XPathValue sequence, filtering out empty tokens
163    let items: Vec<XmlItem<N>> = token_iter
164        .filter(|s| !s.is_empty())
165        .map(|s| XmlItem::Atomic(XmlValue::string(&s)))
166        .collect();
167
168    Ok(XPathValue::from_sequence(items))
169}
170
// ============================================================================
// Helper Functions
// ============================================================================
174
175/// Build a Regex from an XPath pattern and flags using regexml.
176///
177/// regexml natively handles XML Schema regex syntax including:
178/// - Character class subtraction `[A-Z-[OI]]`
179/// - XSD-specific escapes `\i`, `\c`, `\I`, `\C`
180/// - Unicode categories `\p{Lu}`, `\P{Lu}`
181/// - Flag handling (s, m, i, x)
182fn build_regex(pattern: &str, flags: &str) -> Result<Regex, XPathError> {
183    Regex::xpath(pattern, flags).map_err(|e| match e {
184        regexml::Error::InvalidFlags(_) => XPathError::invalid_regex_flags(flags),
185        regexml::Error::Syntax(_) => XPathError::invalid_regex_pattern(pattern),
186        _ => XPathError::invalid_regex_pattern(pattern),
187    })
188}
189
#[cfg(test)]
mod tests {
    use super::*;
    use crate::namespace::table::NameTable;
    use crate::xpath::context::XPathContext;
    use crate::xpath::RoXmlNavigator;

    /// Builds a dynamic context on top of a fresh static context.
    ///
    /// The static context is intentionally leaked with `Box::leak` so that
    /// the reference handed to `DynamicContext::new` lives long enough; this
    /// is only acceptable because the test process is short-lived.
    fn create_context<'a>(names: &'a NameTable) -> DynamicContext<'a, RoXmlNavigator<'a>> {
        let static_ctx = Box::leak(Box::new(XPathContext::new(names)));
        DynamicContext::new(static_ctx, 0)
    }

    /// Wraps string literals as XPath string values, in argument order.
    fn string_args<N: DomNavigator>(values: &[&'static str]) -> Vec<XPathValue<N>> {
        values.iter().map(|text| XPathValue::string(*text)).collect()
    }

    /// Asserts that `value` is a single atomic boolean equal to `expected`.
    #[track_caller]
    fn assert_boolean<N: DomNavigator>(value: XPathValue<N>, expected: bool) {
        assert!(
            matches!(value, XPathValue::Item(XmlItem::Atomic(v)) if v.as_boolean() == Some(expected))
        );
    }

    /// Asserts that `value` is a single atomic string equal to `expected`.
    #[track_caller]
    fn assert_string<N: DomNavigator>(value: XPathValue<N>, expected: &str) {
        match value {
            XPathValue::Item(XmlItem::Atomic(v)) => assert_eq!(v.as_string(), Some(expected)),
            _ => panic!("Expected string"),
        }
    }

    /// Returns the number of items in `value`, which must be a sequence.
    #[track_caller]
    fn sequence_len<N: DomNavigator>(value: XPathValue<N>) -> usize {
        match value {
            XPathValue::Sequence(items) => items.len(),
            _ => panic!("Expected sequence"),
        }
    }

    /// Returns the string values of `value`, which must be a sequence of
    /// atomic items.
    #[track_caller]
    fn sequence_strings<N: DomNavigator>(value: XPathValue<N>) -> Vec<String> {
        match value {
            XPathValue::Sequence(items) => items
                .iter()
                .map(|item| match item {
                    XmlItem::Atomic(v) => v.to_string_value(),
                    _ => panic!("Expected atomic"),
                })
                .collect(),
            _ => panic!("Expected sequence"),
        }
    }

    // =========================================================================
    // fn:matches
    // =========================================================================

    #[test]
    fn test_matches_basic() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["abracadabra", "bra"])).unwrap();

        assert_boolean(outcome, true);
    }

    #[test]
    fn test_matches_no_match() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["abracadabra", "xyz"])).unwrap();

        assert_boolean(outcome, false);
    }

    #[test]
    fn test_matches_case_insensitive() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["HELLO", "hello", "i"])).unwrap();

        assert_boolean(outcome, true);
    }

    #[test]
    fn test_matches_multiline() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["line1\nline2", "^line2", "m"])).unwrap();

        assert_boolean(outcome, true);
    }

    #[test]
    fn test_matches_multiline_empty_line_trailing_newline() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["abcd\ndefg\n", "^$", "m"])).unwrap();

        assert_boolean(outcome, false);
    }

    #[test]
    fn test_matches_multiline_empty_line_in_middle() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["abcd\n\ndefg\n", "^$", "m"])).unwrap();

        assert_boolean(outcome, true);
    }

    #[test]
    fn test_matches_class_subtraction_with_i_flag() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let match_x = matches(&mut ctx, string_args(&["X", "[A-Z-[OI]]", "i"])).unwrap();
        assert_boolean(match_x, true);

        let match_o = matches(&mut ctx, string_args(&["O", "[A-Z-[OI]]", "i"])).unwrap();
        assert_boolean(match_o, false);

        let match_i = matches(&mut ctx, string_args(&["i", "[A-Z-[OI]]", "i"])).unwrap();
        assert_boolean(match_i, false);
    }

    #[test]
    fn test_matches_unicode_categories_with_i_flag() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let upper = matches(&mut ctx, string_args(&["m", r"\p{Lu}", "i"])).unwrap();
        assert_boolean(upper, false);

        let not_upper = matches(&mut ctx, string_args(&["m", r"\P{Lu}", "i"])).unwrap();
        assert_boolean(not_upper, true);
    }

    #[test]
    fn test_matches_invalid_flags() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["test", "test", "z"]));

        assert!(matches!(outcome, Err(XPathError::FORX0001 { .. })));
    }

    #[test]
    fn test_matches_invalid_pattern() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["test", "[invalid"]));

        assert!(matches!(outcome, Err(XPathError::FORX0002 { .. })));
    }

    // =========================================================================
    // fn:replace
    // =========================================================================

    #[test]
    fn test_replace_basic() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = replace(&mut ctx, string_args(&["abracadabra", "a", "X"])).unwrap();

        assert_string(outcome, "XbrXcXdXbrX");
    }

    #[test]
    fn test_replace_with_groups() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = replace(
            &mut ctx,
            string_args(&["hello world", "([a-z]+) ([a-z]+)", "$2 $1"]),
        )
        .unwrap();

        assert_string(outcome, "world hello");
    }

    #[test]
    fn test_replace_zero_length_match() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = replace(&mut ctx, string_args(&["test", "a?", "X"]));

        assert!(matches!(outcome, Err(XPathError::FORX0003 { .. })));
    }

    #[test]
    fn test_replace_invalid_replacement() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        // `$` followed by neither a digit nor another `$`
        let outcome = replace(&mut ctx, string_args(&["test", "t", "$x"]));

        assert!(matches!(outcome, Err(XPathError::FORX0004 { .. })));
    }

    // =========================================================================
    // fn:tokenize
    // =========================================================================

    #[test]
    fn test_tokenize_basic() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = tokenize(&mut ctx, string_args(&["a,b,c", ","])).unwrap();

        let tokens = sequence_strings(outcome);
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens, vec!["a", "b", "c"]);
    }

    #[test]
    fn test_tokenize_whitespace() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = tokenize(&mut ctx, string_args(&["red   green   blue", "\\s+"])).unwrap();

        assert_eq!(sequence_len(outcome), 3);
    }

    #[test]
    fn test_tokenize_empty_input() {
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = tokenize(&mut ctx, string_args(&["", ","])).unwrap();

        assert!(matches!(outcome, XPathValue::Empty));
    }

    #[test]
    fn test_tokenize_filters_empty_tokens() {
        // A leading delimiter must not produce an empty token at the start.
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = tokenize(&mut ctx, string_args(&[",a,b", ","])).unwrap();

        let tokens = sequence_strings(outcome);
        assert_eq!(tokens.len(), 2); // "a" and "b" only, no leading empty
        assert_eq!(tokens, vec!["a", "b"]);
    }

    #[test]
    fn test_tokenize_trailing_delimiter() {
        // A trailing delimiter must not produce an empty token at the end.
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = tokenize(&mut ctx, string_args(&["a,b,", ","])).unwrap();

        assert_eq!(sequence_len(outcome), 2); // "a" and "b" only, no trailing empty
    }

    // =========================================================================
    // XSD/XPath character class escape tests (\i, \c)
    // =========================================================================

    #[test]
    fn test_matches_initial_name_char() {
        // \i matches initial XML name characters
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["_foo", r"\i"])).unwrap();

        assert_boolean(outcome, true);
    }

    #[test]
    fn test_matches_xml_name_pattern() {
        // \i\c* matches XML names
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["foo:bar", r"\i\c*"])).unwrap();

        assert_boolean(outcome, true);
    }

    #[test]
    fn test_matches_digit_not_initial() {
        // \i does NOT match digits
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["123", r"^\i"])).unwrap();

        assert_boolean(outcome, false);
    }

    #[test]
    fn test_matches_name_char_with_digits() {
        // \c matches digits and other name characters
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = matches(&mut ctx, string_args(&["abc123", r"\c+"])).unwrap();

        assert_boolean(outcome, true);
    }

    #[test]
    fn test_replace_with_name_char_pattern() {
        // replace with a \c pattern
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = replace(&mut ctx, string_args(&["hello world", r"\c+", "X"])).unwrap();

        assert_string(outcome, "X X");
    }

    #[test]
    fn test_tokenize_with_non_name_char() {
        // tokenize using \C (non-name character) as delimiter
        let names = NameTable::new();
        let mut ctx = create_context(&names);

        let outcome = tokenize(&mut ctx, string_args(&["foo bar baz", r"\C+"])).unwrap();

        let tokens = sequence_strings(outcome);
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens, vec!["foo", "bar", "baz"]);
    }
}