xsd_schema/
regex_convert.rs

1//! XML Schema / XPath 2.0 regex pattern conversion.
2//!
3//! This module provides shared regex pattern conversion for both XSD pattern facets
4//! and XPath 2.0 regex functions. XPath 2.0 and XSD use the same regex dialect
5//! (XML Schema regex), which differs from standard regex in several ways:
6//!
7//! - XSD-specific character class escapes: `\i`, `\I`, `\c`, `\C`
8//! - XSD patterns are implicitly anchored (must match entire string)
9//! - XPath regex functions do not anchor patterns
10//!
11//! For XSD 1.0 patterns, category escapes `\p{X}` / `\P{X}` are expanded to
12//! explicit Unicode-3.0 ranges before being handed to the underlying regex
13//! engine. This pins the interpretation of `\p{Lu}` etc. to the Unicode version
14//! the MS `msData/regex/reJ*` conformance tests were authored against. See
15//! `regex_xsd_unicode` for the motivation.
16
17use crate::regex_xsd_unicode::{
18    expand_xsd_category_body, xsd10_non_digit_neg_body, xsd10_non_word_char_body,
19    xsd10_private_use_block_body, xsd10_word_char_body,
20};
21use crate::schema::model::XsdVersion;
22
23/// Options for pattern conversion.
24#[derive(Debug, Clone, Copy)]
25pub struct ConvertOptions {
26    /// Whether to anchor the pattern with `^...$` (XSD = true, XPath = false)
27    pub anchor: bool,
28    /// XSD version — selects `\p{X}` lowering. `V1_0` expands recognized
29    /// general-category names to Unicode-3.0 ranges; `V1_1` passes through.
30    pub xsd_version: XsdVersion,
31}
32
33impl Default for ConvertOptions {
34    fn default() -> Self {
35        Self {
36            anchor: false,
37            xsd_version: XsdVersion::V1_1,
38        }
39    }
40}
41
42impl ConvertOptions {
43    /// Create options for XSD 1.1 pattern facets (anchored, modern Unicode).
44    pub fn xsd() -> Self {
45        Self {
46            anchor: true,
47            xsd_version: XsdVersion::V1_1,
48        }
49    }
50
51    /// Create options for XSD 1.0 pattern facets (anchored, Unicode-3.0 pin).
52    pub fn xsd_v1_0() -> Self {
53        Self {
54            anchor: true,
55            xsd_version: XsdVersion::V1_0,
56        }
57    }
58
59    /// Create options for XPath regex functions (unanchored, modern Unicode).
60    pub fn xpath() -> Self {
61        Self {
62            anchor: false,
63            xsd_version: XsdVersion::V1_1,
64        }
65    }
66}
67
68/// Apply MS dialect leniencies to a pattern when the schema set is
69/// configured with [`RegexCompat::LenientMs`].
70///
71/// The textual preprocess is intentionally narrow. It only rewrites
72/// constructs that *no* runtime backend accepts natively, so they would
73/// otherwise fail at compile time even with the strict §F/§G grammar
74/// gate skipped:
75///
76/// - `(?#…)` inline comments. Stripped (including the closing `)`).
77///   Both Rust `regex` and regexml reject `(?#` as an unrecognized
78///   group prefix. .NET treats it as a comment per its native syntax.
79///
80/// Other MS dialect constructs (`^`/`$` anchors outside char class,
81/// non-capturing `(?:…)`, backreferences `\1`, reluctant quantifiers
82/// `*?`/`+?`) are left alone — the runtime backend handles them
83/// natively once the strict grammar gate is bypassed:
84///
85/// - Rust `regex` (default features) natively accepts `^`/`$` as
86///   anchors, `(?:…)`, named groups, reluctant quantifiers; it does
87///   *not* support backreferences or lookaround.
88/// - regexml `xpath()` (xsd11 feature) natively accepts `^`/`$`,
89///   backreferences (`op_back_reference.rs`), `(?:…)`, reluctant
90///   quantifiers; it does not implement lookaround at all.
91///
92/// Constructs neither backend supports (lookahead `(?=…)`, lookbehind
93/// `(?<=…)`) still fail at compile time even under `LenientMs` — that
94/// is an engine limit, not a grammar choice.
95///
96/// Returns the (possibly rewritten) pattern. When [`RegexCompat::Strict`]
97/// is in effect, callers should not invoke this.
98pub fn lenient_ms_preprocess(pattern: &str) -> std::borrow::Cow<'_, str> {
99    if !pattern.contains("(?#") {
100        return std::borrow::Cow::Borrowed(pattern);
101    }
102    std::borrow::Cow::Owned(strip_inline_comments(pattern))
103}
104
/// Remove every `(?#…)` inline comment from `pattern`.
///
/// - A `\` always escapes the character after it, so `\(?#x)` is kept as is.
/// - Inside a character class `(` is an ordinary character, so nothing is
///   stripped between `[` and the next `]`.
/// - A comment ends at the first `)` that is not preceded by a `\`; comments
///   do not nest.
/// - A `(?#` with no closing `)` is not a comment and is emitted literally.
fn strip_inline_comments(pattern: &str) -> String {
    /// Byte offset of the `)` terminating a comment whose text (everything
    /// after `(?#`) is `body`, or `None` when the comment is unterminated.
    fn comment_end(body: &str) -> Option<usize> {
        let bytes = body.as_bytes();
        let mut pos = 0;
        while pos < bytes.len() {
            match bytes[pos] {
                // An escaped byte can never close the comment.
                b'\\' if pos + 1 < bytes.len() => pos += 2,
                b')' => return Some(pos),
                _ => pos += 1,
            }
        }
        None
    }

    let mut stripped = String::with_capacity(pattern.len());
    let mut inside_class = false;
    // `rest` is the not-yet-processed suffix of the pattern.
    let mut rest = pattern;
    while let Some(current) = rest.chars().next() {
        let tail = &rest[current.len_utf8()..];
        match current {
            '\\' => {
                // Copy the backslash together with whatever it escapes.
                stripped.push('\\');
                rest = match tail.chars().next() {
                    Some(escaped) => {
                        stripped.push(escaped);
                        &tail[escaped.len_utf8()..]
                    }
                    None => tail,
                };
                continue;
            }
            '[' => inside_class = true,
            ']' => inside_class = false,
            '(' if !inside_class => {
                if let Some(body) = rest.strip_prefix("(?#") {
                    if let Some(end) = comment_end(body) {
                        // `)` is ASCII, so `end + 1` is a char boundary.
                        rest = &body[end + 1..];
                        continue;
                    }
                    // Unterminated `(?#` — emit the `(` like any other char.
                }
            }
            _ => {}
        }
        stripped.push(current);
        rest = tail;
    }
    stripped
}
166
167/// Convert XSD/XPath regex pattern to Rust regex syntax.
168///
169/// Handles XSD-specific character class escapes:
170/// - `\i` -> `[A-Za-z_:]` (XML initial name character)
171/// - `\I` -> `[^A-Za-z_:]` (not initial name character)
172/// - `\c` -> `[A-Za-z0-9._:\-]` (XML name character)
173/// - `\C` -> `[^A-Za-z0-9._:\-]` (not name character)
174///
175/// Under `XsdVersion::V1_0`, category escapes `\p{X}` and `\P{X}` for
176/// recognized general-category names are expanded to Unicode-3.0 ranges;
177/// block escapes `\p{Is...}` and unknown names are passed through.
178///
179/// # Arguments
180/// - `pattern`: The XSD/XPath regex pattern
181/// - `options`: Conversion options (anchoring, XSD version)
182///
183/// # Returns
184/// A regex pattern string compatible with both the `regex` crate and `regexml`.
185pub fn convert_xml_pattern(pattern: &str, options: ConvertOptions) -> String {
186    let extra_capacity = if options.anchor { 4 } else { 0 };
187    // Under V1_0, `\d` / `\D` / `\w` / `\W` and `\p{X}` expand to multi-KB
188    // explicit ranges; over-allocate to avoid repeated reallocations
189    // (mirrors `rewrite_xsd10_category_escapes` at line 184).
190    let initial_capacity = match options.xsd_version {
191        XsdVersion::V1_0 => pattern.len() * 4 + extra_capacity,
192        XsdVersion::V1_1 => pattern.len() + extra_capacity,
193    };
194    let mut result = String::with_capacity(initial_capacity);
195
196    if options.anchor {
197        result.push('^');
198    }
199
200    let mut in_class = false;
201    let mut chars = pattern.chars().peekable();
202    while let Some(ch) = chars.next() {
203        if ch == '\\' {
204            let Some(&next) = chars.peek() else {
205                result.push('\\');
206                continue;
207            };
208            match next {
209                // XSD-specific character class escapes
210                'i' => {
211                    chars.next();
212                    result.push_str(r"[A-Za-z_:]");
213                }
214                'I' => {
215                    chars.next();
216                    result.push_str(r"[^A-Za-z_:]");
217                }
218                'c' => {
219                    chars.next();
220                    result.push_str(r"[A-Za-z0-9._:\-]");
221                }
222                'C' => {
223                    chars.next();
224                    result.push_str(r"[^A-Za-z0-9._:\-]");
225                }
226                // XSD 1.0 multi-character class escapes \d, \D, \w, \W —
227                // expand to explicit Unicode-3.0 ranges so the regex engine
228                // (which uses modern Unicode for \d / \w / \D / \W) cannot
229                // disagree with the MS reS/reT/reU test expectations. Inside
230                // a character class only the positive forms expand inline;
231                // the negated forms fall through to the engine's native
232                // escape (set complementation isn't expressible inline).
233                'd' | 'D' | 'w' | 'W'
234                    if options.xsd_version == XsdVersion::V1_0
235                        && expand_xsd10_class_escape(&mut result, next, in_class) =>
236                {
237                    chars.next();
238                }
239                // Standard escapes - pass through
240                'd' | 'D' | 's' | 'S' | 'w' | 'W' | 'n' | 'r' | 't' | '\\' | '|' | '.' | '?'
241                | '*' | '+' | '{' | '}' | '(' | ')' | '[' | ']' | '^' | '$' | '-' => {
242                    result.push('\\');
243                    result.push(next);
244                    chars.next();
245                }
246                // Unicode category escapes \p{...} / \P{...}
247                'p' | 'P' => {
248                    let negated = next == 'P';
249                    chars.next();
250                    handle_category_escape(
251                        &mut result,
252                        &mut chars,
253                        negated,
254                        in_class,
255                        options.xsd_version == XsdVersion::V1_0,
256                    );
257                }
258                // Other escapes - pass through
259                _ => {
260                    result.push('\\');
261                    result.push(next);
262                    chars.next();
263                }
264            }
265        } else {
266            if ch == '[' {
267                in_class = true;
268            } else if ch == ']' {
269                in_class = false;
270            }
271            result.push(ch);
272        }
273    }
274
275    if options.anchor {
276        result.push('$');
277    }
278    result
279}
280
281/// Rewrite XSD 1.0 general-category escapes `\p{X}` / `\P{X}` to explicit
282/// Unicode 3.0 range classes, leaving every other character (including
283/// `\i`, `\I`, `\c`, `\C`, standard escapes, and nested character classes)
284/// untouched.
285///
286/// Intended for the `xsd11` feature path, where regexml handles all other
287/// XSD regex constructs natively but we still need to pin category-escape
288/// semantics to Unicode 3.0 for XSD 1.0 patterns. Block escapes (`Is...`)
289/// and unknown category names pass through unchanged.
290pub fn rewrite_xsd10_category_escapes(pattern: &str) -> String {
291    let mut result = String::with_capacity(pattern.len() * 4);
292    let mut in_class = false;
293    let mut chars = pattern.chars().peekable();
294    while let Some(ch) = chars.next() {
295        if ch != '\\' {
296            if ch == '[' {
297                in_class = true;
298            } else if ch == ']' {
299                in_class = false;
300            }
301            result.push(ch);
302            continue;
303        }
304        let Some(&next) = chars.peek() else {
305            result.push('\\');
306            continue;
307        };
308        if matches!(next, 'd' | 'D' | 'w' | 'W')
309            && expand_xsd10_class_escape(&mut result, next, in_class)
310        {
311            chars.next();
312            continue;
313        }
314        if next != 'p' && next != 'P' {
315            result.push('\\');
316            result.push(next);
317            chars.next();
318            continue;
319        }
320        let negated = next == 'P';
321        chars.next();
322        handle_category_escape(&mut result, &mut chars, negated, in_class, true);
323    }
324    result
325}
326
327/// Expand the XSD 1.0 multi-character class escapes `\d`, `\D`, `\w`, `\W`
328/// to explicit Unicode-3.0 ranges. Returns `true` if the expansion was
329/// emitted; `false` means the caller should fall back to passing the escape
330/// through verbatim.
331///
332/// All four expansions are BMP-bounded, matching MS test expectations
333/// authored against pre-Unicode-3.1 / UTF-16-unit semantics:
334///   - `\d` → `[<Nd>]` (positive, BMP)
335///   - `\D` → `[^<Nd>U+10000-U+10FFFD]` (negation excludes supplementary plane)
336///   - `\w` → `[<L+M+N+S>]` (positive, BMP — excludes Cn / supplementary)
337///   - `\W` → `[<P+Z+C>]` (positive, BMP — excludes supplementary)
338///
339/// Inside a character class only `\d` and `\w` expand inline (their bodies
340/// merge cleanly into the surrounding class); `\D` / `\W` would need set
341/// complementation, so they are passed through to the engine in that
342/// position.
343fn expand_xsd10_class_escape(out: &mut String, escape: char, in_class: bool) -> bool {
344    let (body, negated): (&str, bool) = match escape {
345        'd' => (expand_xsd_category_body("Nd").unwrap_or(""), false),
346        'D' => (xsd10_non_digit_neg_body(), true),
347        'w' => (xsd10_word_char_body(), false),
348        'W' => (xsd10_non_word_char_body(), false),
349        _ => return false,
350    };
351    if body.is_empty() {
352        return false;
353    }
354    if in_class {
355        if negated {
356            return false;
357        }
358        out.push_str(body);
359        return true;
360    }
361    if negated {
362        out.push_str("[^");
363    } else {
364        out.push('[');
365    }
366    out.push_str(body);
367    out.push(']');
368    true
369}
370
371/// Validate XSD 1.0 regex character-class hyphen rules — stricter than the backend
372/// parsers and stricter than XSD 1.1.
373///
374/// Per XSD 1.0 Datatypes §F (regex grammar productions [14]–[22]) under longest-match
375/// disambiguation, an unescaped `-` inside a character class must be (a) the first
376/// atom (immediately after `[` or `[^`), (b) the last atom (immediately before `]`),
377/// (c) the middle character of an `seRange` (e.g. `a-z`), or (d) the subtraction
378/// operator separating a `posCharGroup` from a nested `charClassExpr` (`...-[...]`).
379/// Any other position — e.g. `[a-c-1]`, `[^a-d-b-c]`, `[a-z-+]`, `[--z]` — is
380/// ambiguous and a syntax error in XSD 1.0. XSD 1.1 (Datatypes 1.1 §G) relaxed
381/// these rules, allowing literal hyphens elsewhere via `XmlCharIncDash`, so this
382/// validator must only be invoked for XSD 1.0.
383pub fn validate_xml_pattern_syntax(pattern: &str) -> Result<(), String> {
384    let chars: Vec<char> = pattern.chars().collect();
385    let mut index = 0;
386    while index < chars.len() {
387        match chars[index] {
388            '\\' => index = skip_escape(&chars, index + 1),
389            '[' => index = validate_char_class(&chars, index + 1)?,
390            _ => index += 1,
391        }
392    }
393    Ok(())
394}
395
/// What the character-class validator remembers about one atom of a class.
#[derive(Clone, Copy)]
struct ClassAtom {
    /// `true` when the atom may serve as the start of an `x-y` range.
    available_for_range: bool,
    /// `true` when the atom is a literal `-` written without a backslash.
    unescaped_hyphen: bool,
}
401
402fn validate_char_class(chars: &[char], mut index: usize) -> Result<usize, String> {
403    let mut prev_atom: Option<ClassAtom> = None;
404    let mut at_group_start = true;
405    let mut allow_nested_class = false;
406
407    if chars.get(index) == Some(&'^') {
408        index += 1;
409    }
410
411    while index < chars.len() {
412        match chars[index] {
413            '\\' => {
414                let (is_single_char, next_index) = consume_class_escape(chars, index + 1);
415                prev_atom = Some(ClassAtom {
416                    available_for_range: is_single_char,
417                    unescaped_hyphen: false,
418                });
419                at_group_start = false;
420                allow_nested_class = false;
421                index = next_index;
422            }
423            '[' => {
424                if !allow_nested_class {
425                    return Err("unescaped '[' in character class".to_string());
426                }
427                index = validate_char_class(chars, index + 1)?;
428                prev_atom = Some(ClassAtom {
429                    available_for_range: false,
430                    unescaped_hyphen: false,
431                });
432                at_group_start = false;
433                allow_nested_class = false;
434            }
435            ']' => return Ok(index + 1),
436            '-' => {
437                let next = chars.get(index + 1).copied();
438                let next_after = chars.get(index + 2).copied();
439
440                if next == Some('[') {
441                    allow_nested_class = true;
442                    prev_atom = None;
443                    at_group_start = false;
444                    index += 1;
445                    continue;
446                }
447
448                if at_group_start
449                    || next == Some(']')
450                    || (next == Some('-') && next_after == Some('['))
451                {
452                    prev_atom = Some(ClassAtom {
453                        available_for_range: true,
454                        unescaped_hyphen: true,
455                    });
456                    at_group_start = false;
457                    allow_nested_class = false;
458                    index += 1;
459                    continue;
460                }
461
462                let Some(prev) = prev_atom else {
463                    return Err("hyphen is not a valid character range operator".to_string());
464                };
465                if !prev.available_for_range || prev.unescaped_hyphen {
466                    return Err("hyphen is not a valid character range operator".to_string());
467                }
468
469                let Some((range_end, next_index)) = peek_single_class_atom(chars, index + 1) else {
470                    return Err("hyphen is not followed by a valid range endpoint".to_string());
471                };
472                if range_end.unescaped_hyphen {
473                    return Err("unescaped hyphen cannot be a character range endpoint".to_string());
474                }
475
476                prev_atom = Some(ClassAtom {
477                    available_for_range: false,
478                    unescaped_hyphen: false,
479                });
480                at_group_start = false;
481                allow_nested_class = false;
482                index = next_index;
483            }
484            _ => {
485                prev_atom = Some(ClassAtom {
486                    available_for_range: true,
487                    unescaped_hyphen: false,
488                });
489                at_group_start = false;
490                allow_nested_class = false;
491                index += 1;
492            }
493        }
494    }
495
496    Err("unterminated character class".to_string())
497}
498
/// Return the index just past the escape whose first character (the one
/// following the backslash) sits at `index`.
///
/// A `p{` / `P{` escape extends through its closing `}`, or to the end of
/// the input when it is never closed. Any other escape is one character
/// long. The result never exceeds `chars.len()`.
fn skip_escape(chars: &[char], index: usize) -> usize {
    let total = chars.len();
    let opens_property =
        matches!(chars.get(index), Some('p' | 'P')) && matches!(chars.get(index + 1), Some('{'));
    if !opens_property {
        return index.saturating_add(1).min(total);
    }
    let body_start = index + 2;
    chars[body_start..]
        .iter()
        .position(|&symbol| symbol == '}')
        .map_or(total, |offset| body_start + offset + 1)
}
512
513fn consume_class_escape(chars: &[char], index: usize) -> (bool, usize) {
514    let is_single_char = matches!(
515        chars.get(index),
516        Some(
517            'n' | 'r'
518                | 't'
519                | '\\'
520                | '|'
521                | '.'
522                | '?'
523                | '*'
524                | '+'
525                | '('
526                | ')'
527                | '{'
528                | '}'
529                | '-'
530                | '['
531                | ']'
532                | '^'
533        )
534    );
535    (is_single_char, skip_escape(chars, index))
536}
537
538fn peek_single_class_atom(chars: &[char], index: usize) -> Option<(ClassAtom, usize)> {
539    match chars.get(index).copied()? {
540        '\\' => {
541            let (is_single_char, next_index) = consume_class_escape(chars, index + 1);
542            is_single_char.then_some((
543                ClassAtom {
544                    available_for_range: false,
545                    unescaped_hyphen: false,
546                },
547                next_index,
548            ))
549        }
550        '[' | ']' => None,
551        '-' => Some((
552            ClassAtom {
553                available_for_range: false,
554                unescaped_hyphen: true,
555            },
556            index + 1,
557        )),
558        _ => Some((
559            ClassAtom {
560                available_for_range: false,
561                unescaped_hyphen: false,
562            },
563            index + 1,
564        )),
565    }
566}
567
568/// Look up the XSD 1.0 / Unicode 3.0 char-class body for `\p{name}`,
569/// covering both general-category codes and block names. Returns `None`
570/// for names handled by the engine natively (other `IsX` blocks, unknown
571/// names, Cn/Cs).
572///
573/// `IsPrivateUse` is overridden here because regexml's block lookup
574/// follows the DIS XSD 1.1 backwards-compatibility table and unions the
575/// BMP PUA with the supplementary PUAs (Plane 15 / 16). Those areas did
576/// not exist in Unicode 3.0 and the W3C MS reL/reM/reN tests require them
577/// to be excluded under XSD 1.0.
578fn xsd10_category_or_block_body(name: &str) -> Option<&'static str> {
579    if name == "IsPrivateUse" {
580        return Some(xsd10_private_use_block_body());
581    }
582    expand_xsd_category_body(name)
583}
584
585/// Shared lowering for `\p{X}` / `\P{X}` under XSD 1.0 Unicode-3.0 pinning.
586///
587/// Returns `true` if `name` is a recognized general-category code and the
588/// appropriate expansion was appended to `out`. Returns `false` otherwise
589/// (block escape, unknown name, or a negated escape inside a character class
590/// — which would require set subtraction that isn't expressible here), in
591/// which case the caller is expected to emit the original `\p{...}` /
592/// `\P{...}` tokens verbatim.
593///
594/// - Positive `\p{X}` inside `[...]`: appends just the expanded body (no
595///   nested brackets).
596/// - Positive `\p{X}` outside: wraps the body with `[...]`.
597/// - Negated `\P{X}` outside: wraps with `[^...]`.
598fn try_expand_category(out: &mut String, name: &str, negated: bool, in_class: bool) -> bool {
599    let Some(body) = xsd10_category_or_block_body(name) else {
600        return false;
601    };
602    if in_class {
603        if negated {
604            return false;
605        }
606        out.push_str(body);
607        return true;
608    }
609    if negated {
610        out.push_str("[^");
611    } else {
612        out.push('[');
613    }
614    out.push_str(body);
615    out.push(']');
616    true
617}
618
619/// Parse `{name}` (the body of a `\p{…}` / `\P{…}` escape) and append either
620/// the expanded character class (when `try_expand` is true and `name` is a
621/// recognized general-category code) or the verbatim original token.
622///
623/// Caller has already consumed `\` and the `p`/`P`; `chars` is positioned
624/// just before the opening `{` (or at a stray `\p`/`\P` if `{` is absent).
625fn handle_category_escape(
626    out: &mut String,
627    chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
628    negated: bool,
629    in_class: bool,
630    try_expand: bool,
631) {
632    let marker = if negated { 'P' } else { 'p' };
633    if chars.peek() != Some(&'{') {
634        out.push('\\');
635        out.push(marker);
636        return;
637    }
638    chars.next();
639    let mut name = String::new();
640    let mut closed = false;
641    for c in chars.by_ref() {
642        if c == '}' {
643            closed = true;
644            break;
645        }
646        name.push(c);
647    }
648    if try_expand && closed && try_expand_category(out, &name, negated, in_class) {
649        return;
650    }
651    out.push('\\');
652    out.push(marker);
653    out.push('{');
654    out.push_str(&name);
655    if closed {
656        out.push('}');
657    }
658}
659
// Unit tests for pattern conversion, the XSD 1.0 escape rewrite, the
// character-class hyphen validator and the lenient-MS preprocess. Converted
// patterns are compiled with the `regex` crate to check matching behavior.
#[cfg(test)]
mod tests {
    use super::*;
    use regex::Regex;

    // `\i` lowers to the ASCII initial-name-character class.
    #[test]
    fn test_initial_name_char_escape() {
        let result = convert_xml_pattern(r"\i", ConvertOptions::xpath());
        assert_eq!(result, r"[A-Za-z_:]");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("A"));
        assert!(regex.is_match("_"));
        assert!(!regex.is_match("1"));
    }

    // `\I` lowers to the complement of the `\i` class.
    #[test]
    fn test_not_initial_name_char_escape() {
        let result = convert_xml_pattern(r"\I", ConvertOptions::xpath());
        assert_eq!(result, r"[^A-Za-z_:]");
        let regex = Regex::new(&result).unwrap();
        assert!(!regex.is_match("A"));
        assert!(regex.is_match("1"));
        assert!(regex.is_match(" "));
    }

    // `\c` lowers to the ASCII name-character class.
    #[test]
    fn test_name_char_escape() {
        let result = convert_xml_pattern(r"\c", ConvertOptions::xpath());
        assert_eq!(result, r"[A-Za-z0-9._:\-]");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("A"));
        assert!(regex.is_match("1"));
        assert!(regex.is_match("-"));
        assert!(!regex.is_match(" "));
    }

    // `\C` lowers to the complement of the `\c` class.
    #[test]
    fn test_not_name_char_escape() {
        let result = convert_xml_pattern(r"\C", ConvertOptions::xpath());
        assert_eq!(result, r"[^A-Za-z0-9._:\-]");
        let regex = Regex::new(&result).unwrap();
        assert!(!regex.is_match("A"));
        assert!(!regex.is_match("1"));
        assert!(regex.is_match(" "));
    }

    // XSD options wrap the pattern in `^...$`.
    #[test]
    fn test_xsd_anchoring() {
        let result = convert_xml_pattern("abc", ConvertOptions::xsd());
        assert_eq!(result, "^abc$");
    }

    // XPath options leave the pattern unanchored.
    #[test]
    fn test_xpath_no_anchoring() {
        let result = convert_xml_pattern("abc", ConvertOptions::xpath());
        assert_eq!(result, "abc");
    }

    // `\i\c*` — the usual "XML name" shape — converts and matches as a whole.
    #[test]
    fn test_xml_name_pattern() {
        let result = convert_xml_pattern(r"\i\c*", ConvertOptions::xsd());
        assert_eq!(result, r"^[A-Za-z_:][A-Za-z0-9._:\-]*$");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("foo"));
        assert!(regex.is_match("foo:bar"));
        assert!(regex.is_match("_bar"));
        assert!(!regex.is_match("123"));
    }

    // Under the XPath (V1_1) options `\d`, `\s`, `\w` are copied through.
    #[test]
    fn test_standard_escapes_preserved() {
        let result = convert_xml_pattern(r"\d+\s*\w+", ConvertOptions::xpath());
        assert_eq!(result, r"\d+\s*\w+");
    }

    // Under V1_1 category escapes are left for the engine.
    #[test]
    fn test_v1_1_preserves_p_escape() {
        let result = convert_xml_pattern(r"\p{L}\P{N}", ConvertOptions::xpath());
        assert_eq!(result, r"\p{L}\P{N}");
    }

    // Under V1_0 `\p{Lu}` becomes an explicit bracketed class.
    #[test]
    fn test_v1_0_expands_p_category_escape() {
        let result = convert_xml_pattern(r"\p{Lu}*", ConvertOptions::xsd_v1_0());
        assert!(result.starts_with("^["));
        assert!(result.ends_with("]*$"));
        assert!(!result.contains("\\p{"));
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("A"));
        assert!(regex.is_match("ABC"));
        assert!(!regex.is_match("a"));
        // reJ11 contract: U+1D7A8 is Lu in modern Unicode but
        // unassigned in Unicode 3.0, so it must NOT match here.
        let s = format!("A{}", char::from_u32(0x1D7A8).unwrap());
        assert!(!regex.is_match(&s));
    }

    // Under V1_0 a negated `\P{N}` outside a class becomes `[^...]`.
    #[test]
    fn test_v1_0_expands_negated_p_category_escape() {
        let result = convert_xml_pattern(r"\P{N}*", ConvertOptions::xsd_v1_0());
        assert!(result.contains("[^"));
        assert!(!result.contains("\\P{"));
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("abc"));
        assert!(!regex.is_match("123"));
    }

    // `IsBasicLatin` is a block name without a V1_0 override.
    #[test]
    fn test_v1_0_passes_through_block_escape() {
        let result = convert_xml_pattern(r"\p{IsBasicLatin}*", ConvertOptions::xsd_v1_0());
        // Block escapes are not expanded — left for the regex engine.
        assert!(result.contains(r"\p{IsBasicLatin}"));
    }

    // Unrecognized names are emitted verbatim even under V1_0.
    #[test]
    fn test_v1_0_passes_through_unknown_category() {
        let result = convert_xml_pattern(r"\p{Xx}", ConvertOptions::xsd_v1_0());
        assert!(result.contains(r"\p{Xx}"));
    }

    // XSD-specific and standard escapes can be mixed in one pattern.
    #[test]
    fn test_mixed_pattern() {
        let result = convert_xml_pattern(r"\i\c*:\d+", ConvertOptions::xsd());
        assert_eq!(result, r"^[A-Za-z_:][A-Za-z0-9._:\-]*:\d+$");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("item:123"));
        assert!(!regex.is_match("123:abc"));
    }

    // The empty pattern yields only the anchors (XSD) or nothing (XPath).
    #[test]
    fn test_empty_pattern() {
        let result = convert_xml_pattern("", ConvertOptions::xsd());
        assert_eq!(result, "^$");

        let result = convert_xml_pattern("", ConvertOptions::xpath());
        assert_eq!(result, "");
    }

    // A backslash with nothing after it is copied through unchanged.
    #[test]
    fn test_trailing_backslash() {
        let result = convert_xml_pattern(r"abc\", ConvertOptions::xpath());
        assert_eq!(result, r"abc\");
    }

    // The rewrite expands `\p{Lu}` but leaves `\i` / `\c` as written.
    #[test]
    fn test_rewrite_xsd10_expands_p_but_keeps_name_escapes() {
        let result = rewrite_xsd10_category_escapes(r"\i\c*\p{Lu}+");
        assert!(result.starts_with(r"\i\c*["), "unexpected: {}", result);
        assert!(result.ends_with("]+"), "unexpected: {}", result);
        assert!(!result.contains(r"\p{"));
    }

    // Block escapes without an override survive the rewrite untouched.
    #[test]
    fn test_rewrite_xsd10_passes_block_escapes() {
        let result = rewrite_xsd10_category_escapes(r"\p{IsBasicLatin}+");
        assert_eq!(result, r"\p{IsBasicLatin}+");
    }

    // Unknown names survive the rewrite untouched.
    #[test]
    fn test_rewrite_xsd10_passes_unknown_names() {
        let result = rewrite_xsd10_category_escapes(r"\p{Xx}");
        assert_eq!(result, r"\p{Xx}");
    }

    // A negated category outside a class is rewritten to `[^...]`.
    #[test]
    fn test_rewrite_xsd10_negated_category() {
        let result = rewrite_xsd10_category_escapes(r"\P{N}+");
        assert!(result.starts_with("[^"));
        assert!(result.ends_with("]+"));
    }

    // Hyphen placement: first, last, range operator and subtraction operator
    // are accepted; every other position is rejected.
    #[test]
    fn test_validate_xsd10_character_class_hyphen_rules() {
        for valid in [
            r"[a-d]",
            r"[-a]+",
            r"[-]",
            r"[a-]",
            r"[a-\}-]+",
            r"[a-z--[b-z]]",
            r"[a-b-[0-9]]+",
        ] {
            assert!(
                validate_xml_pattern_syntax(valid).is_ok(),
                "expected valid XSD 1.0 regex: {valid}",
            );
        }

        // Invalid forms drawn from W3C msData reF20-23, reG26-33, reH19-21 and
        // saxonData/Simple/simple045 — each is listed as XSD-1.0-invalid in the
        // suite manifest, regardless of whether XSD 1.1 accepts the same form.
        for invalid in [
            r"[^a-d-b-c]",
            r"[a-c-1-4x-z-7-9]*",
            r"[a-a-x-x]+",
            r"[a-z-+]*",
            r"[a--b]",
            r"[--z]",
        ] {
            assert!(
                validate_xml_pattern_syntax(invalid).is_err(),
                "expected invalid XSD 1.0 regex: {invalid}",
            );
        }
    }

    // `(?#…)` comments are removed wherever they appear in the pattern.
    #[test]
    fn lenient_ms_strips_inline_comments() {
        assert_eq!(lenient_ms_preprocess("a(?#note)b"), "ab");
        assert_eq!(lenient_ms_preprocess("(?#start)abc(?#end)"), "abc");
    }

    // Patterns without a comment are handed back without allocating.
    #[test]
    fn lenient_ms_passthrough_when_clean() {
        // No `(?#` — should return Borrowed without copying.
        let p = "^abc[0-9]+$";
        let result = lenient_ms_preprocess(p);
        assert!(matches!(result, std::borrow::Cow::Borrowed(_)));
        assert_eq!(result, p);
    }

    // `^` / `$` and negated classes pass through the preprocess unchanged.
    #[test]
    fn lenient_ms_keeps_anchors_for_engine() {
        // Anchors are handled natively by both backends after the
        // `^(?:...)$` wrapping; preprocess no longer strips them.
        assert_eq!(lenient_ms_preprocess("^abc$"), "^abc$");
        assert_eq!(lenient_ms_preprocess("[^abc]"), "[^abc]");
    }
}
xsd_schema/regex_convert.rs

xsd_schema/
regex_convert.rs