1use crate::regex_xsd_unicode::{
18 expand_xsd_category_body, xsd10_non_digit_neg_body, xsd10_non_word_char_body,
19 xsd10_private_use_block_body, xsd10_word_char_body,
20};
21use crate::schema::model::XsdVersion;
22
/// Settings that control how [`convert_xml_pattern`] translates a pattern.
#[derive(Debug, Clone, Copy)]
pub struct ConvertOptions {
    /// When `true`, the converted pattern is wrapped in a leading `^` and a
    /// trailing `$`.
    pub anchor: bool,
    /// XSD version whose rules apply. With `XsdVersion::V1_0` the converter
    /// tries to expand `\d`, `\D`, `\w`, `\W` and `\p{..}` / `\P{..}` into
    /// explicit character classes instead of passing them through.
    pub xsd_version: XsdVersion,
}
32
33impl Default for ConvertOptions {
34 fn default() -> Self {
35 Self {
36 anchor: false,
37 xsd_version: XsdVersion::V1_1,
38 }
39 }
40}
41
42impl ConvertOptions {
43 pub fn xsd() -> Self {
45 Self {
46 anchor: true,
47 xsd_version: XsdVersion::V1_1,
48 }
49 }
50
51 pub fn xsd_v1_0() -> Self {
53 Self {
54 anchor: true,
55 xsd_version: XsdVersion::V1_0,
56 }
57 }
58
59 pub fn xpath() -> Self {
61 Self {
62 anchor: false,
63 xsd_version: XsdVersion::V1_1,
64 }
65 }
66}
67
68pub fn lenient_ms_preprocess(pattern: &str) -> std::borrow::Cow<'_, str> {
99 if !pattern.contains("(?#") {
100 return std::borrow::Cow::Borrowed(pattern);
101 }
102 std::borrow::Cow::Owned(strip_inline_comments(pattern))
103}
104
/// Returns `pattern` with every closed `(?#...)` comment removed.
///
/// Rules applied while scanning left to right:
/// * a backslash and the character after it are copied verbatim, so an
///   escaped `(` never starts a comment;
/// * `[` and `]` toggle "inside character class" state, and no comment is
///   recognised while inside a class;
/// * a comment ends at the first `)` that is not preceded by a backslash;
/// * a `(?#` with no closing `)` is left in place.
fn strip_inline_comments(pattern: &str) -> String {
    const COMMENT_OPEN: &str = "(?#";

    let mut kept = String::with_capacity(pattern.len());
    let mut inside_class = false;
    // Byte offset of the next unread character; always on a char boundary.
    let mut pos = 0;

    while let Some(current) = pattern[pos..].chars().next() {
        let tail = &pattern[pos..];
        match current {
            '\\' => {
                kept.push('\\');
                pos += 1;
                // Copy the escaped character (if any) without interpreting it.
                if let Some(escaped) = tail[1..].chars().next() {
                    kept.push(escaped);
                    pos += escaped.len_utf8();
                }
            }
            '[' | ']' => {
                inside_class = current == '[';
                kept.push(current);
                pos += 1;
            }
            '(' if !inside_class && tail.starts_with(COMMENT_OPEN) => {
                // Byte scan is safe: `\` and `)` are ASCII and can never
                // appear inside a multi-byte UTF-8 sequence.
                let body = &tail.as_bytes()[COMMENT_OPEN.len()..];
                let mut cursor = 0;
                let closing = loop {
                    match body.get(cursor) {
                        None => break None,
                        Some(b'\\') if cursor + 1 < body.len() => cursor += 2,
                        Some(b')') => break Some(cursor),
                        Some(_) => cursor += 1,
                    }
                };
                match closing {
                    // Jump past the whole comment, including its `)`.
                    Some(offset) => pos += COMMENT_OPEN.len() + offset + 1,
                    // Unterminated: keep the `(` and carry on normally.
                    None => {
                        kept.push('(');
                        pos += 1;
                    }
                }
            }
            other => {
                kept.push(other);
                pos += other.len_utf8();
            }
        }
    }

    kept
}
166
167pub fn convert_xml_pattern(pattern: &str, options: ConvertOptions) -> String {
186 let extra_capacity = if options.anchor { 4 } else { 0 };
187 let initial_capacity = match options.xsd_version {
191 XsdVersion::V1_0 => pattern.len() * 4 + extra_capacity,
192 XsdVersion::V1_1 => pattern.len() + extra_capacity,
193 };
194 let mut result = String::with_capacity(initial_capacity);
195
196 if options.anchor {
197 result.push('^');
198 }
199
200 let mut in_class = false;
201 let mut chars = pattern.chars().peekable();
202 while let Some(ch) = chars.next() {
203 if ch == '\\' {
204 let Some(&next) = chars.peek() else {
205 result.push('\\');
206 continue;
207 };
208 match next {
209 'i' => {
211 chars.next();
212 result.push_str(r"[A-Za-z_:]");
213 }
214 'I' => {
215 chars.next();
216 result.push_str(r"[^A-Za-z_:]");
217 }
218 'c' => {
219 chars.next();
220 result.push_str(r"[A-Za-z0-9._:\-]");
221 }
222 'C' => {
223 chars.next();
224 result.push_str(r"[^A-Za-z0-9._:\-]");
225 }
226 'd' | 'D' | 'w' | 'W'
234 if options.xsd_version == XsdVersion::V1_0
235 && expand_xsd10_class_escape(&mut result, next, in_class) =>
236 {
237 chars.next();
238 }
239 'd' | 'D' | 's' | 'S' | 'w' | 'W' | 'n' | 'r' | 't' | '\\' | '|' | '.' | '?'
241 | '*' | '+' | '{' | '}' | '(' | ')' | '[' | ']' | '^' | '$' | '-' => {
242 result.push('\\');
243 result.push(next);
244 chars.next();
245 }
246 'p' | 'P' => {
248 let negated = next == 'P';
249 chars.next();
250 handle_category_escape(
251 &mut result,
252 &mut chars,
253 negated,
254 in_class,
255 options.xsd_version == XsdVersion::V1_0,
256 );
257 }
258 _ => {
260 result.push('\\');
261 result.push(next);
262 chars.next();
263 }
264 }
265 } else {
266 if ch == '[' {
267 in_class = true;
268 } else if ch == ']' {
269 in_class = false;
270 }
271 result.push(ch);
272 }
273 }
274
275 if options.anchor {
276 result.push('$');
277 }
278 result
279}
280
281pub fn rewrite_xsd10_category_escapes(pattern: &str) -> String {
291 let mut result = String::with_capacity(pattern.len() * 4);
292 let mut in_class = false;
293 let mut chars = pattern.chars().peekable();
294 while let Some(ch) = chars.next() {
295 if ch != '\\' {
296 if ch == '[' {
297 in_class = true;
298 } else if ch == ']' {
299 in_class = false;
300 }
301 result.push(ch);
302 continue;
303 }
304 let Some(&next) = chars.peek() else {
305 result.push('\\');
306 continue;
307 };
308 if matches!(next, 'd' | 'D' | 'w' | 'W')
309 && expand_xsd10_class_escape(&mut result, next, in_class)
310 {
311 chars.next();
312 continue;
313 }
314 if next != 'p' && next != 'P' {
315 result.push('\\');
316 result.push(next);
317 chars.next();
318 continue;
319 }
320 let negated = next == 'P';
321 chars.next();
322 handle_category_escape(&mut result, &mut chars, negated, in_class, true);
323 }
324 result
325}
326
327fn expand_xsd10_class_escape(out: &mut String, escape: char, in_class: bool) -> bool {
344 let (body, negated): (&str, bool) = match escape {
345 'd' => (expand_xsd_category_body("Nd").unwrap_or(""), false),
346 'D' => (xsd10_non_digit_neg_body(), true),
347 'w' => (xsd10_word_char_body(), false),
348 'W' => (xsd10_non_word_char_body(), false),
349 _ => return false,
350 };
351 if body.is_empty() {
352 return false;
353 }
354 if in_class {
355 if negated {
356 return false;
357 }
358 out.push_str(body);
359 return true;
360 }
361 if negated {
362 out.push_str("[^");
363 } else {
364 out.push('[');
365 }
366 out.push_str(body);
367 out.push(']');
368 true
369}
370
371pub fn validate_xml_pattern_syntax(pattern: &str) -> Result<(), String> {
384 let chars: Vec<char> = pattern.chars().collect();
385 let mut index = 0;
386 while index < chars.len() {
387 match chars[index] {
388 '\\' => index = skip_escape(&chars, index + 1),
389 '[' => index = validate_char_class(&chars, index + 1)?,
390 _ => index += 1,
391 }
392 }
393 Ok(())
394}
395
/// What `validate_char_class` remembers about the most recent item seen
/// inside a character class.
#[derive(Clone, Copy)]
struct ClassAtom {
    /// `true` when the item may serve as the start of an `a-b` range.
    available_for_range: bool,
    /// `true` when the item is a literal, unescaped `-`.
    unescaped_hyphen: bool,
}
401
402fn validate_char_class(chars: &[char], mut index: usize) -> Result<usize, String> {
403 let mut prev_atom: Option<ClassAtom> = None;
404 let mut at_group_start = true;
405 let mut allow_nested_class = false;
406
407 if chars.get(index) == Some(&'^') {
408 index += 1;
409 }
410
411 while index < chars.len() {
412 match chars[index] {
413 '\\' => {
414 let (is_single_char, next_index) = consume_class_escape(chars, index + 1);
415 prev_atom = Some(ClassAtom {
416 available_for_range: is_single_char,
417 unescaped_hyphen: false,
418 });
419 at_group_start = false;
420 allow_nested_class = false;
421 index = next_index;
422 }
423 '[' => {
424 if !allow_nested_class {
425 return Err("unescaped '[' in character class".to_string());
426 }
427 index = validate_char_class(chars, index + 1)?;
428 prev_atom = Some(ClassAtom {
429 available_for_range: false,
430 unescaped_hyphen: false,
431 });
432 at_group_start = false;
433 allow_nested_class = false;
434 }
435 ']' => return Ok(index + 1),
436 '-' => {
437 let next = chars.get(index + 1).copied();
438 let next_after = chars.get(index + 2).copied();
439
440 if next == Some('[') {
441 allow_nested_class = true;
442 prev_atom = None;
443 at_group_start = false;
444 index += 1;
445 continue;
446 }
447
448 if at_group_start
449 || next == Some(']')
450 || (next == Some('-') && next_after == Some('['))
451 {
452 prev_atom = Some(ClassAtom {
453 available_for_range: true,
454 unescaped_hyphen: true,
455 });
456 at_group_start = false;
457 allow_nested_class = false;
458 index += 1;
459 continue;
460 }
461
462 let Some(prev) = prev_atom else {
463 return Err("hyphen is not a valid character range operator".to_string());
464 };
465 if !prev.available_for_range || prev.unescaped_hyphen {
466 return Err("hyphen is not a valid character range operator".to_string());
467 }
468
469 let Some((range_end, next_index)) = peek_single_class_atom(chars, index + 1) else {
470 return Err("hyphen is not followed by a valid range endpoint".to_string());
471 };
472 if range_end.unescaped_hyphen {
473 return Err("unescaped hyphen cannot be a character range endpoint".to_string());
474 }
475
476 prev_atom = Some(ClassAtom {
477 available_for_range: false,
478 unescaped_hyphen: false,
479 });
480 at_group_start = false;
481 allow_nested_class = false;
482 index = next_index;
483 }
484 _ => {
485 prev_atom = Some(ClassAtom {
486 available_for_range: true,
487 unescaped_hyphen: false,
488 });
489 at_group_start = false;
490 allow_nested_class = false;
491 index += 1;
492 }
493 }
494 }
495
496 Err("unterminated character class".to_string())
497}
498
/// Returns the index just past the escape whose first character (the one
/// after the backslash) sits at `index`.
///
/// `p{...}` / `P{...}` are skipped through the closing `}`, or to the end
/// of input when no `}` follows. Any other escape is one character long.
/// The result never exceeds `chars.len()`.
fn skip_escape(chars: &[char], index: usize) -> usize {
    let is_braced_property =
        matches!(chars.get(index), Some('p' | 'P')) && chars.get(index + 1) == Some(&'{');
    if !is_braced_property {
        return index.saturating_add(1).min(chars.len());
    }

    // `index + 1` is in bounds here, so `index + 2 <= chars.len()`.
    let name_start = index + 2;
    match chars[name_start..].iter().position(|&c| c == '}') {
        Some(offset) => name_start + offset + 1,
        None => chars.len(),
    }
}
512
513fn consume_class_escape(chars: &[char], index: usize) -> (bool, usize) {
514 let is_single_char = matches!(
515 chars.get(index),
516 Some(
517 'n' | 'r'
518 | 't'
519 | '\\'
520 | '|'
521 | '.'
522 | '?'
523 | '*'
524 | '+'
525 | '('
526 | ')'
527 | '{'
528 | '}'
529 | '-'
530 | '['
531 | ']'
532 | '^'
533 )
534 );
535 (is_single_char, skip_escape(chars, index))
536}
537
538fn peek_single_class_atom(chars: &[char], index: usize) -> Option<(ClassAtom, usize)> {
539 match chars.get(index).copied()? {
540 '\\' => {
541 let (is_single_char, next_index) = consume_class_escape(chars, index + 1);
542 is_single_char.then_some((
543 ClassAtom {
544 available_for_range: false,
545 unescaped_hyphen: false,
546 },
547 next_index,
548 ))
549 }
550 '[' | ']' => None,
551 '-' => Some((
552 ClassAtom {
553 available_for_range: false,
554 unescaped_hyphen: true,
555 },
556 index + 1,
557 )),
558 _ => Some((
559 ClassAtom {
560 available_for_range: false,
561 unescaped_hyphen: false,
562 },
563 index + 1,
564 )),
565 }
566}
567
568fn xsd10_category_or_block_body(name: &str) -> Option<&'static str> {
579 if name == "IsPrivateUse" {
580 return Some(xsd10_private_use_block_body());
581 }
582 expand_xsd_category_body(name)
583}
584
585fn try_expand_category(out: &mut String, name: &str, negated: bool, in_class: bool) -> bool {
599 let Some(body) = xsd10_category_or_block_body(name) else {
600 return false;
601 };
602 if in_class {
603 if negated {
604 return false;
605 }
606 out.push_str(body);
607 return true;
608 }
609 if negated {
610 out.push_str("[^");
611 } else {
612 out.push('[');
613 }
614 out.push_str(body);
615 out.push(']');
616 true
617}
618
619fn handle_category_escape(
626 out: &mut String,
627 chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
628 negated: bool,
629 in_class: bool,
630 try_expand: bool,
631) {
632 let marker = if negated { 'P' } else { 'p' };
633 if chars.peek() != Some(&'{') {
634 out.push('\\');
635 out.push(marker);
636 return;
637 }
638 chars.next();
639 let mut name = String::new();
640 let mut closed = false;
641 for c in chars.by_ref() {
642 if c == '}' {
643 closed = true;
644 break;
645 }
646 name.push(c);
647 }
648 if try_expand && closed && try_expand_category(out, &name, negated, in_class) {
649 return;
650 }
651 out.push('\\');
652 out.push(marker);
653 out.push('{');
654 out.push_str(&name);
655 if closed {
656 out.push('}');
657 }
658}
659
// Unit tests for pattern conversion, XSD 1.0 escape rewriting, character
// class validation and inline-comment stripping.
#[cfg(test)]
mod tests {
    use super::*;
    use regex::Regex;

    // `\i` becomes the fixed ASCII "initial name character" class.
    #[test]
    fn test_initial_name_char_escape() {
        let result = convert_xml_pattern(r"\i", ConvertOptions::xpath());
        assert_eq!(result, r"[A-Za-z_:]");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("A"));
        assert!(regex.is_match("_"));
        assert!(!regex.is_match("1"));
    }

    // `\I` becomes the negation of the `\i` class.
    #[test]
    fn test_not_initial_name_char_escape() {
        let result = convert_xml_pattern(r"\I", ConvertOptions::xpath());
        assert_eq!(result, r"[^A-Za-z_:]");
        let regex = Regex::new(&result).unwrap();
        assert!(!regex.is_match("A"));
        assert!(regex.is_match("1"));
        assert!(regex.is_match(" "));
    }

    // `\c` becomes the fixed ASCII "name character" class.
    #[test]
    fn test_name_char_escape() {
        let result = convert_xml_pattern(r"\c", ConvertOptions::xpath());
        assert_eq!(result, r"[A-Za-z0-9._:\-]");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("A"));
        assert!(regex.is_match("1"));
        assert!(regex.is_match("-"));
        assert!(!regex.is_match(" "));
    }

    // `\C` becomes the negation of the `\c` class.
    #[test]
    fn test_not_name_char_escape() {
        let result = convert_xml_pattern(r"\C", ConvertOptions::xpath());
        assert_eq!(result, r"[^A-Za-z0-9._:\-]");
        let regex = Regex::new(&result).unwrap();
        assert!(!regex.is_match("A"));
        assert!(!regex.is_match("1"));
        assert!(regex.is_match(" "));
    }

    // The `xsd()` preset wraps the pattern in `^` ... `$`.
    #[test]
    fn test_xsd_anchoring() {
        let result = convert_xml_pattern("abc", ConvertOptions::xsd());
        assert_eq!(result, "^abc$");
    }

    // The `xpath()` preset adds no anchors.
    #[test]
    fn test_xpath_no_anchoring() {
        let result = convert_xml_pattern("abc", ConvertOptions::xpath());
        assert_eq!(result, "abc");
    }

    // `\i\c*` combined with anchoring matches name-like strings only.
    #[test]
    fn test_xml_name_pattern() {
        let result = convert_xml_pattern(r"\i\c*", ConvertOptions::xsd());
        assert_eq!(result, r"^[A-Za-z_:][A-Za-z0-9._:\-]*$");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("foo"));
        assert!(regex.is_match("foo:bar"));
        assert!(regex.is_match("_bar"));
        assert!(!regex.is_match("123"));
    }

    // Under XSD 1.1 rules `\d`, `\s`, `\w` are passed through untouched.
    #[test]
    fn test_standard_escapes_preserved() {
        let result = convert_xml_pattern(r"\d+\s*\w+", ConvertOptions::xpath());
        assert_eq!(result, r"\d+\s*\w+");
    }

    // Under XSD 1.1 rules `\p{..}` / `\P{..}` are passed through untouched.
    #[test]
    fn test_v1_1_preserves_p_escape() {
        let result = convert_xml_pattern(r"\p{L}\P{N}", ConvertOptions::xpath());
        assert_eq!(result, r"\p{L}\P{N}");
    }

    // Under XSD 1.0 rules `\p{Lu}` is replaced by an explicit class.
    #[test]
    fn test_v1_0_expands_p_category_escape() {
        let result = convert_xml_pattern(r"\p{Lu}*", ConvertOptions::xsd_v1_0());
        assert!(result.starts_with("^["));
        assert!(result.ends_with("]*$"));
        assert!(!result.contains("\\p{"));
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("A"));
        assert!(regex.is_match("ABC"));
        assert!(!regex.is_match("a"));
        // NOTE(review): U+1D7A8 is expected to fall outside the expanded Lu
        // class — presumably because the XSD 1.0 tables predate it; confirm.
        let s = format!("A{}", char::from_u32(0x1D7A8).unwrap());
        assert!(!regex.is_match(&s));
    }

    // Under XSD 1.0 rules `\P{N}` is replaced by a negated explicit class.
    #[test]
    fn test_v1_0_expands_negated_p_category_escape() {
        let result = convert_xml_pattern(r"\P{N}*", ConvertOptions::xsd_v1_0());
        assert!(result.contains("[^"));
        assert!(!result.contains("\\P{"));
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("abc"));
        assert!(!regex.is_match("123"));
    }

    // A block escape with no known expansion is kept verbatim.
    #[test]
    fn test_v1_0_passes_through_block_escape() {
        let result = convert_xml_pattern(r"\p{IsBasicLatin}*", ConvertOptions::xsd_v1_0());
        assert!(result.contains(r"\p{IsBasicLatin}"));
    }

    // An unknown category name is kept verbatim.
    #[test]
    fn test_v1_0_passes_through_unknown_category() {
        let result = convert_xml_pattern(r"\p{Xx}", ConvertOptions::xsd_v1_0());
        assert!(result.contains(r"\p{Xx}"));
    }

    // Name escapes and a pass-through `\d` in one anchored pattern.
    #[test]
    fn test_mixed_pattern() {
        let result = convert_xml_pattern(r"\i\c*:\d+", ConvertOptions::xsd());
        assert_eq!(result, r"^[A-Za-z_:][A-Za-z0-9._:\-]*:\d+$");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("item:123"));
        assert!(!regex.is_match("123:abc"));
    }

    // An empty pattern yields only the anchors (or nothing at all).
    #[test]
    fn test_empty_pattern() {
        let result = convert_xml_pattern("", ConvertOptions::xsd());
        assert_eq!(result, "^$");

        let result = convert_xml_pattern("", ConvertOptions::xpath());
        assert_eq!(result, "");
    }

    // A lone trailing backslash is copied through rather than dropped.
    #[test]
    fn test_trailing_backslash() {
        let result = convert_xml_pattern(r"abc\", ConvertOptions::xpath());
        assert_eq!(result, r"abc\");
    }

    // The rewrite-only entry point expands `\p{Lu}` but leaves `\i` / `\c`.
    #[test]
    fn test_rewrite_xsd10_expands_p_but_keeps_name_escapes() {
        let result = rewrite_xsd10_category_escapes(r"\i\c*\p{Lu}+");
        assert!(result.starts_with(r"\i\c*["), "unexpected: {}", result);
        assert!(result.ends_with("]+"), "unexpected: {}", result);
        assert!(!result.contains(r"\p{"));
    }

    // A block escape with no known expansion survives the rewrite unchanged.
    #[test]
    fn test_rewrite_xsd10_passes_block_escapes() {
        let result = rewrite_xsd10_category_escapes(r"\p{IsBasicLatin}+");
        assert_eq!(result, r"\p{IsBasicLatin}+");
    }

    // An unknown category name survives the rewrite unchanged.
    #[test]
    fn test_rewrite_xsd10_passes_unknown_names() {
        let result = rewrite_xsd10_category_escapes(r"\p{Xx}");
        assert_eq!(result, r"\p{Xx}");
    }

    // `\P{N}` is rewritten into a negated explicit class.
    #[test]
    fn test_rewrite_xsd10_negated_category() {
        let result = rewrite_xsd10_category_escapes(r"\P{N}+");
        assert!(result.starts_with("[^"));
        assert!(result.ends_with("]+"));
    }

    // Hyphen placement rules inside character classes: accepted forms
    // first, then forms the validator must reject.
    #[test]
    fn test_validate_xsd10_character_class_hyphen_rules() {
        for valid in [
            r"[a-d]",
            r"[-a]+",
            r"[-]",
            r"[a-]",
            r"[a-\}-]+",
            r"[a-z--[b-z]]",
            r"[a-b-[0-9]]+",
        ] {
            assert!(
                validate_xml_pattern_syntax(valid).is_ok(),
                "expected valid XSD 1.0 regex: {valid}",
            );
        }

        for invalid in [
            r"[^a-d-b-c]",
            r"[a-c-1-4x-z-7-9]*",
            r"[a-a-x-x]+",
            r"[a-z-+]*",
            r"[a--b]",
            r"[--z]",
        ] {
            assert!(
                validate_xml_pattern_syntax(invalid).is_err(),
                "expected invalid XSD 1.0 regex: {invalid}",
            );
        }
    }

    // `(?#...)` comments are removed wherever they appear in the pattern.
    #[test]
    fn lenient_ms_strips_inline_comments() {
        assert_eq!(lenient_ms_preprocess("a(?#note)b"), "ab");
        assert_eq!(lenient_ms_preprocess("(?#start)abc(?#end)"), "abc");
    }

    // A pattern without `(?#` is returned borrowed, with no allocation.
    #[test]
    fn lenient_ms_passthrough_when_clean() {
        let p = "^abc[0-9]+$";
        let result = lenient_ms_preprocess(p);
        assert!(matches!(result, std::borrow::Cow::Borrowed(_)));
        assert_eq!(result, p);
    }

    // `^` and `$` are left alone by the preprocessor, inside or outside a
    // character class.
    #[test]
    fn lenient_ms_keeps_anchors_for_engine() {
        assert_eq!(lenient_ms_preprocess("^abc$"), "^abc$");
        assert_eq!(lenient_ms_preprocess("[^abc]"), "[^abc]");
    }
}