use crate::regex_xsd_unicode::{
expand_xsd_category_body, xsd10_non_digit_neg_body, xsd10_non_word_char_body,
xsd10_private_use_block_body, xsd10_word_char_body,
};
use crate::schema::model::XsdVersion;
/// Options that control how [`convert_xml_pattern`] rewrites a pattern.
#[derive(Debug, Clone, Copy)]
pub struct ConvertOptions {
    /// When `true`, the converted pattern is anchored with a leading `^` and a
    /// trailing `$`.
    pub anchor: bool,
    /// Schema version in effect. With `XsdVersion::V1_0` the converter tries to
    /// expand `\d`, `\D`, `\w`, `\W` and `\p{…}` / `\P{…}` escapes into explicit
    /// character classes; otherwise those escapes are passed through.
    pub xsd_version: XsdVersion,
}
impl Default for ConvertOptions {
    /// Unanchored conversion using `XsdVersion::V1_1`.
    fn default() -> Self {
        let anchor = false;
        let xsd_version = XsdVersion::V1_1;
        Self {
            anchor,
            xsd_version,
        }
    }
}
impl ConvertOptions {
pub fn xsd() -> Self {
Self {
anchor: true,
xsd_version: XsdVersion::V1_1,
}
}
pub fn xsd_v1_0() -> Self {
Self {
anchor: true,
xsd_version: XsdVersion::V1_0,
}
}
pub fn xpath() -> Self {
Self {
anchor: false,
xsd_version: XsdVersion::V1_1,
}
}
}
/// Removes `(?#…)` inline comments from `pattern`.
///
/// Patterns that do not contain the text `(?#` at all are returned borrowed,
/// without allocating; every other pattern is passed through
/// `strip_inline_comments` and returned as an owned string.
pub fn lenient_ms_preprocess(pattern: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    if pattern.contains("(?#") {
        Cow::Owned(strip_inline_comments(pattern))
    } else {
        Cow::Borrowed(pattern)
    }
}
/// Returns a copy of `pattern` with every `(?#…)` inline comment removed.
///
/// * A backslash and the character after it are copied verbatim, so an escaped
///   `(` never opens a comment.
/// * Text between an unescaped `[` and the next unescaped `]` is copied
///   verbatim; `(?#` is not treated as a comment there.
/// * A comment ends at the first `)` that is not preceded by a backslash.
///   A `(?#` with no such `)` is left in the output unchanged.
fn strip_inline_comments(pattern: &str) -> String {
    const COMMENT_OPEN: &str = "(?#";

    /// Byte offset, within `body`, of the `)` that closes the comment.
    fn comment_close(body: &str) -> Option<usize> {
        let bytes = body.as_bytes();
        let mut pos = 0;
        while let Some(&byte) = bytes.get(pos) {
            match byte {
                // Step over the backslash and the byte after it. Landing inside
                // a multi-byte character is harmless: continuation bytes can
                // never equal `\` or `)`.
                b'\\' if pos + 1 < bytes.len() => pos += 2,
                b')' => return Some(pos),
                _ => pos += 1,
            }
        }
        None
    }

    let mut stripped = String::with_capacity(pattern.len());
    let mut inside_class = false;
    // Unprocessed suffix of `pattern`; always starts on a char boundary.
    let mut rest = pattern;
    while let Some(current) = rest.chars().next() {
        let tail = &rest[current.len_utf8()..];
        match current {
            '\\' => {
                stripped.push(current);
                let mut after = tail.chars();
                if let Some(escaped) = after.next() {
                    stripped.push(escaped);
                }
                rest = after.as_str();
                continue;
            }
            '[' => inside_class = true,
            ']' => inside_class = false,
            '(' if !inside_class && rest.starts_with(COMMENT_OPEN) => {
                let body = &rest[COMMENT_OPEN.len()..];
                if let Some(close) = comment_close(body) {
                    // Drop the whole comment, including its closing `)`.
                    rest = &body[close + 1..];
                    continue;
                }
                // Unterminated comment: fall through and keep the `(`.
            }
            _ => {}
        }
        stripped.push(current);
        rest = tail;
    }
    stripped
}
/// Reports whether `pattern` contains an unescaped `|` that is outside every
/// character class and outside every parenthesised group.
fn has_top_level_alternation(pattern: &str) -> bool {
    let mut in_class = false;
    let mut group_depth = 0usize;
    let mut chars = pattern.chars();
    while let Some(ch) = chars.next() {
        match ch {
            // Skip the escaped character so `\|`, `\(`, `\[` are not counted.
            '\\' => {
                chars.next();
            }
            '[' => in_class = true,
            ']' => in_class = false,
            '(' if !in_class => group_depth += 1,
            ')' if !in_class => group_depth = group_depth.saturating_sub(1),
            '|' if !in_class && group_depth == 0 => return true,
            _ => {}
        }
    }
    false
}

/// Converts an XML Schema / XPath regular expression into a pattern string
/// for the target regex engine.
///
/// * `\i`, `\I`, `\c`, `\C` are replaced by the ASCII classes `[A-Za-z_:]`,
///   `[^A-Za-z_:]`, `[A-Za-z0-9._:\-]` and `[^A-Za-z0-9._:\-]`.
/// * With `XsdVersion::V1_0`, `\d`, `\D`, `\w`, `\W` and `\p{…}` / `\P{…}` are
///   expanded into explicit classes where an expansion is available; otherwise
///   they are passed through unchanged.
/// * Every other escape, and every unescaped character, is copied verbatim.
/// * With `options.anchor`, the result is wrapped in `^` … `$`. If the pattern
///   has a top-level alternation the body is additionally wrapped in `(?:…)`,
///   because `^a|b$` would otherwise parse as `(^a)|(b$)` and accept strings
///   that merely start with `a` or end with `b`.
pub fn convert_xml_pattern(pattern: &str, options: ConvertOptions) -> String {
    let wrap_in_group = options.anchor && has_top_level_alternation(pattern);
    let extra_capacity = match (options.anchor, wrap_in_group) {
        (false, _) => 0,
        (true, false) => 4,
        // Room for `^(?:` and `)$`.
        (true, true) => 8,
    };
    let initial_capacity = match options.xsd_version {
        // V1.0 expansions replace short escapes with long classes.
        XsdVersion::V1_0 => pattern.len() * 4 + extra_capacity,
        XsdVersion::V1_1 => pattern.len() + extra_capacity,
    };
    let mut result = String::with_capacity(initial_capacity);
    if options.anchor {
        result.push('^');
        if wrap_in_group {
            result.push_str("(?:");
        }
    }
    // Tracks whether the cursor is between an unescaped `[` and `]`; the
    // expansion helpers emit a bare class body instead of `[...]` in that case.
    let mut in_class = false;
    let mut chars = pattern.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch == '\\' {
            // A trailing backslash is copied as-is.
            let Some(&next) = chars.peek() else {
                result.push('\\');
                continue;
            };
            match next {
                'i' => {
                    chars.next();
                    result.push_str(r"[A-Za-z_:]");
                }
                'I' => {
                    chars.next();
                    result.push_str(r"[^A-Za-z_:]");
                }
                'c' => {
                    chars.next();
                    result.push_str(r"[A-Za-z0-9._:\-]");
                }
                'C' => {
                    chars.next();
                    result.push_str(r"[^A-Za-z0-9._:\-]");
                }
                // The guard writes the expansion into `result` when it succeeds;
                // when it declines, the escape falls through to the next arm.
                'd' | 'D' | 'w' | 'W'
                    if options.xsd_version == XsdVersion::V1_0
                        && expand_xsd10_class_escape(&mut result, next, in_class) =>
                {
                    chars.next();
                }
                // Escapes copied through unchanged.
                'd' | 'D' | 's' | 'S' | 'w' | 'W' | 'n' | 'r' | 't' | '\\' | '|' | '.' | '?'
                | '*' | '+' | '{' | '}' | '(' | ')' | '[' | ']' | '^' | '$' | '-' => {
                    result.push('\\');
                    result.push(next);
                    chars.next();
                }
                'p' | 'P' => {
                    let negated = next == 'P';
                    chars.next();
                    handle_category_escape(
                        &mut result,
                        &mut chars,
                        negated,
                        in_class,
                        options.xsd_version == XsdVersion::V1_0,
                    );
                }
                // Unknown escapes are also copied through unchanged.
                _ => {
                    result.push('\\');
                    result.push(next);
                    chars.next();
                }
            }
        } else {
            if ch == '[' {
                in_class = true;
            } else if ch == ']' {
                in_class = false;
            }
            result.push(ch);
        }
    }
    if options.anchor {
        if wrap_in_group {
            result.push(')');
        }
        result.push('$');
    }
    result
}
/// Rewrites only the escapes that have an XSD 1.0 expansion (`\d`, `\D`, `\w`,
/// `\W`, `\p{…}`, `\P{…}`), leaving all other text — including `\i` and `\c` —
/// untouched. No anchors are added.
pub fn rewrite_xsd10_category_escapes(pattern: &str) -> String {
    let mut rewritten = String::with_capacity(pattern.len() * 4);
    let mut inside_class = false;
    let mut input = pattern.chars().peekable();
    while let Some(current) = input.next() {
        if current != '\\' {
            match current {
                '[' => inside_class = true,
                ']' => inside_class = false,
                _ => {}
            }
            rewritten.push(current);
            continue;
        }
        match input.peek().copied() {
            // Trailing backslash: copy it as-is.
            None => rewritten.push('\\'),
            Some(marker @ ('p' | 'P')) => {
                input.next();
                handle_category_escape(
                    &mut rewritten,
                    &mut input,
                    marker == 'P',
                    inside_class,
                    true,
                );
            }
            Some(escaped) => {
                // The helper writes the expansion itself when it succeeds.
                let expanded = matches!(escaped, 'd' | 'D' | 'w' | 'W')
                    && expand_xsd10_class_escape(&mut rewritten, escaped, inside_class);
                if !expanded {
                    rewritten.push('\\');
                    rewritten.push(escaped);
                }
                input.next();
            }
        }
    }
    rewritten
}
/// Appends the XSD 1.0 expansion of `\d`, `\D`, `\w` or `\W` to `out`.
///
/// Returns `true` when something was written. Returns `false`, writing
/// nothing, when `escape` is not one of those four letters, when the expansion
/// body is empty, or when a negated body would have to be placed inside an
/// existing character class.
///
/// Outside a class the body is wrapped in `[...]` (or `[^...]` when negated);
/// inside a class the bare body is appended.
fn expand_xsd10_class_escape(out: &mut String, escape: char, in_class: bool) -> bool {
    let (body, negated): (&str, bool) = match escape {
        'd' => (expand_xsd_category_body("Nd").unwrap_or(""), false),
        'D' => (xsd10_non_digit_neg_body(), true),
        'w' => (xsd10_word_char_body(), false),
        'W' => (xsd10_non_word_char_body(), false),
        _ => return false,
    };
    if body.is_empty() {
        return false;
    }
    match (in_class, negated) {
        (true, true) => return false,
        (true, false) => out.push_str(body),
        (false, _) => {
            out.push_str(if negated { "[^" } else { "[" });
            out.push_str(body);
            out.push(']');
        }
    }
    true
}
/// Checks the character classes of `pattern`.
///
/// Escapes outside a class are skipped; each unescaped `[` hands over to
/// `validate_char_class`, whose error message is returned unchanged.
///
/// # Errors
///
/// Returns the message produced by `validate_char_class` for the first
/// character class that fails validation.
pub fn validate_xml_pattern_syntax(pattern: &str) -> Result<(), String> {
    let symbols: Vec<char> = pattern.chars().collect();
    let mut cursor = 0;
    while let Some(&symbol) = symbols.get(cursor) {
        cursor = match symbol {
            '\\' => skip_escape(&symbols, cursor + 1),
            '[' => validate_char_class(&symbols, cursor + 1)?,
            _ => cursor + 1,
        };
    }
    Ok(())
}
/// Summary of one item seen while scanning the inside of a character class.
#[derive(Clone, Copy)]
struct ClassAtom {
    /// Whether a following `-` may use this atom as the start of a range.
    available_for_range: bool,
    /// Whether this atom is a literal, unescaped `-`.
    unescaped_hyphen: bool,
}
/// Validates one character class whose opening `[` sits just before `index`.
///
/// On success returns the index just past the matching `]`. On failure returns
/// a message describing the first problem found: a stray `[`, a misplaced
/// hyphen, a bad range endpoint, or a missing `]`.
fn validate_char_class(chars: &[char], mut index: usize) -> Result<usize, String> {
    // Last atom seen, consulted when a `-` tries to use it as a range start.
    let mut prev_atom: Option<ClassAtom> = None;
    // True until the first atom (or hyphen) of the class has been consumed.
    let mut at_group_start = true;
    // Set only right after a `-` that is immediately followed by `[`.
    let mut allow_nested_class = false;
    // A leading `^` (negation) is skipped without ending the "group start" state.
    if chars.get(index) == Some(&'^') {
        index += 1;
    }
    while index < chars.len() {
        match chars[index] {
            '\\' => {
                // Only single-character escapes can start a range.
                let (is_single_char, next_index) = consume_class_escape(chars, index + 1);
                prev_atom = Some(ClassAtom {
                    available_for_range: is_single_char,
                    unescaped_hyphen: false,
                });
                at_group_start = false;
                allow_nested_class = false;
                index = next_index;
            }
            '[' => {
                // A nested class is accepted only directly after `-`.
                if !allow_nested_class {
                    return Err("unescaped '[' in character class".to_string());
                }
                index = validate_char_class(chars, index + 1)?;
                prev_atom = Some(ClassAtom {
                    available_for_range: false,
                    unescaped_hyphen: false,
                });
                at_group_start = false;
                allow_nested_class = false;
            }
            // End of this class.
            ']' => return Ok(index + 1),
            '-' => {
                let next = chars.get(index + 1).copied();
                let next_after = chars.get(index + 2).copied();
                // `-[` introduces a nested class; consume the hyphen only.
                if next == Some('[') {
                    allow_nested_class = true;
                    prev_atom = None;
                    at_group_start = false;
                    index += 1;
                    continue;
                }
                // The hyphen is a literal character when it is first in the
                // class, last before `]`, or directly followed by `-[`.
                if at_group_start
                    || next == Some(']')
                    || (next == Some('-') && next_after == Some('['))
                {
                    prev_atom = Some(ClassAtom {
                        available_for_range: true,
                        unescaped_hyphen: true,
                    });
                    at_group_start = false;
                    allow_nested_class = false;
                    index += 1;
                    continue;
                }
                // Otherwise the hyphen must be a range operator: it needs a
                // usable start atom that is not itself a literal hyphen.
                let Some(prev) = prev_atom else {
                    return Err("hyphen is not a valid character range operator".to_string());
                };
                if !prev.available_for_range || prev.unescaped_hyphen {
                    return Err("hyphen is not a valid character range operator".to_string());
                }
                // ... and a single-character end point that is not a bare `-`.
                let Some((range_end, next_index)) = peek_single_class_atom(chars, index + 1) else {
                    return Err("hyphen is not followed by a valid range endpoint".to_string());
                };
                if range_end.unescaped_hyphen {
                    return Err("unescaped hyphen cannot be a character range endpoint".to_string());
                }
                // A completed range cannot start another range.
                prev_atom = Some(ClassAtom {
                    available_for_range: false,
                    unescaped_hyphen: false,
                });
                at_group_start = false;
                allow_nested_class = false;
                index = next_index;
            }
            _ => {
                // Any other character is an ordinary atom that may start a range.
                prev_atom = Some(ClassAtom {
                    available_for_range: true,
                    unescaped_hyphen: false,
                });
                at_group_start = false;
                allow_nested_class = false;
                index += 1;
            }
        }
    }
    // Ran off the end of the pattern without seeing the closing `]`.
    Err("unterminated character class".to_string())
}
/// Returns the index just past the escape whose first character (the one after
/// the backslash) is at `index`.
///
/// `p{…}` / `P{…}` escapes extend through the closing `}`, or to the end of
/// `chars` when no `}` follows. Every other escape is one character long. The
/// result never exceeds `chars.len()`.
fn skip_escape(chars: &[char], index: usize) -> usize {
    let opens_category =
        matches!(chars.get(index), Some('p' | 'P')) && chars.get(index + 1) == Some(&'{');
    if !opens_category {
        return index.saturating_add(1).min(chars.len());
    }
    // `chars[index + 1]` exists, so `index + 2` is at most `chars.len()`.
    let name_start = index + 2;
    match chars[name_start..].iter().position(|&c| c == '}') {
        Some(offset) => name_start + offset + 1,
        None => chars.len(),
    }
}
/// Classifies the escape whose first character (after the backslash) is at
/// `index` and reports where it ends.
///
/// The boolean is `true` only for escapes that denote one literal character
/// (`\n`, `\r`, `\t` and the escaped punctuation listed below); the index is
/// whatever `skip_escape` returns for the same position.
fn consume_class_escape(chars: &[char], index: usize) -> (bool, usize) {
    // Characters that form a single-character escape when preceded by `\`.
    const SINGLE_CHAR_ESCAPES: &str = "nrt\\|.?*+(){}-[]^";

    let is_single_char = match chars.get(index) {
        Some(&escaped) => SINGLE_CHAR_ESCAPES.contains(escaped),
        None => false,
    };
    let end = skip_escape(chars, index);
    (is_single_char, end)
}
/// Reads the atom at `index` as a candidate range end point.
///
/// Returns the atom and the index just past it, or `None` when `index` is out
/// of bounds, the character is `[` or `]`, or the escape found there is not a
/// single-character escape. The returned atom is never available as a range
/// start; its `unescaped_hyphen` flag is set when the atom is a bare `-`.
fn peek_single_class_atom(chars: &[char], index: usize) -> Option<(ClassAtom, usize)> {
    let first = *chars.get(index)?;
    let endpoint = |unescaped_hyphen: bool| ClassAtom {
        available_for_range: false,
        unescaped_hyphen,
    };
    match first {
        '[' | ']' => None,
        '\\' => {
            let (is_single_char, after_escape) = consume_class_escape(chars, index + 1);
            if is_single_char {
                Some((endpoint(false), after_escape))
            } else {
                None
            }
        }
        other => Some((endpoint(other == '-'), index + 1)),
    }
}
/// Looks up the class body for a `\p{…}` name: the `IsPrivateUse` block is
/// special-cased, every other name goes through `expand_xsd_category_body`.
fn xsd10_category_or_block_body(name: &str) -> Option<&'static str> {
    match name {
        "IsPrivateUse" => Some(xsd10_private_use_block_body()),
        _ => expand_xsd_category_body(name),
    }
}
/// Appends the expansion of `\p{name}` (or `\P{name}` when `negated`) to `out`.
///
/// Returns `false`, writing nothing, when `name` has no known body or when a
/// negated expansion would have to go inside an existing character class.
/// Outside a class the body is wrapped in `[...]` / `[^...]`; inside a class
/// the bare body is appended.
fn try_expand_category(out: &mut String, name: &str, negated: bool, in_class: bool) -> bool {
    let body = match xsd10_category_or_block_body(name) {
        Some(body) => body,
        None => return false,
    };
    match (in_class, negated) {
        (true, true) => return false,
        (true, false) => out.push_str(body),
        (false, _) => {
            out.push_str(if negated { "[^" } else { "[" });
            out.push_str(body);
            out.push(']');
        }
    }
    true
}
/// Handles a `\p` / `\P` escape whose marker letter has already been consumed
/// from `chars`.
///
/// * Without a following `{`, only `\p` / `\P` is written.
/// * Otherwise the name is read up to the closing `}` (or end of input). When
///   `try_expand` is set, the escape is closed, and `try_expand_category`
///   succeeds, the expansion it wrote is the output.
/// * In every other case the escape is written back as read: `\p{name}`, or
///   `\p{name` when the closing brace was missing.
fn handle_category_escape(
    out: &mut String,
    chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
    negated: bool,
    in_class: bool,
    try_expand: bool,
) {
    let prefix = if negated { r"\P" } else { r"\p" };
    if chars.next_if_eq(&'{').is_none() {
        out.push_str(prefix);
        return;
    }
    // `take_while` consumes the `}` that stops it, and `closed` records whether
    // the scan ended on that brace or by running out of input.
    let mut closed = false;
    let name: String = chars
        .by_ref()
        .take_while(|&c| {
            closed = c == '}';
            !closed
        })
        .collect();
    if try_expand && closed && try_expand_category(out, &name, negated, in_class) {
        return;
    }
    out.push_str(prefix);
    out.push('{');
    out.push_str(&name);
    if closed {
        out.push('}');
    }
}
// Unit tests for pattern conversion, XSD 1.0 escape rewriting, character-class
// validation and inline-comment stripping.
#[cfg(test)]
mod tests {
    use super::*;
    use regex::Regex;
    // `\i` becomes the ASCII initial-name-character class.
    #[test]
    fn test_initial_name_char_escape() {
        let result = convert_xml_pattern(r"\i", ConvertOptions::xpath());
        assert_eq!(result, r"[A-Za-z_:]");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("A"));
        assert!(regex.is_match("_"));
        assert!(!regex.is_match("1"));
    }
    // `\I` becomes the negated initial-name-character class.
    #[test]
    fn test_not_initial_name_char_escape() {
        let result = convert_xml_pattern(r"\I", ConvertOptions::xpath());
        assert_eq!(result, r"[^A-Za-z_:]");
        let regex = Regex::new(&result).unwrap();
        assert!(!regex.is_match("A"));
        assert!(regex.is_match("1"));
        assert!(regex.is_match(" "));
    }
    // `\c` becomes the ASCII name-character class.
    #[test]
    fn test_name_char_escape() {
        let result = convert_xml_pattern(r"\c", ConvertOptions::xpath());
        assert_eq!(result, r"[A-Za-z0-9._:\-]");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("A"));
        assert!(regex.is_match("1"));
        assert!(regex.is_match("-"));
        assert!(!regex.is_match(" "));
    }
    // `\C` becomes the negated name-character class.
    #[test]
    fn test_not_name_char_escape() {
        let result = convert_xml_pattern(r"\C", ConvertOptions::xpath());
        assert_eq!(result, r"[^A-Za-z0-9._:\-]");
        let regex = Regex::new(&result).unwrap();
        assert!(!regex.is_match("A"));
        assert!(!regex.is_match("1"));
        assert!(regex.is_match(" "));
    }
    // The `xsd()` preset anchors the pattern.
    #[test]
    fn test_xsd_anchoring() {
        let result = convert_xml_pattern("abc", ConvertOptions::xsd());
        assert_eq!(result, "^abc$");
    }
    // The `xpath()` preset adds no anchors.
    #[test]
    fn test_xpath_no_anchoring() {
        let result = convert_xml_pattern("abc", ConvertOptions::xpath());
        assert_eq!(result, "abc");
    }
    // `\i\c*` converts to an anchored name matcher.
    #[test]
    fn test_xml_name_pattern() {
        let result = convert_xml_pattern(r"\i\c*", ConvertOptions::xsd());
        assert_eq!(result, r"^[A-Za-z_:][A-Za-z0-9._:\-]*$");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("foo"));
        assert!(regex.is_match("foo:bar"));
        assert!(regex.is_match("_bar"));
        assert!(!regex.is_match("123"));
    }
    // With V1.1 options, `\d`, `\s` and `\w` pass through unchanged.
    #[test]
    fn test_standard_escapes_preserved() {
        let result = convert_xml_pattern(r"\d+\s*\w+", ConvertOptions::xpath());
        assert_eq!(result, r"\d+\s*\w+");
    }
    // With V1.1 options, category escapes pass through unchanged.
    #[test]
    fn test_v1_1_preserves_p_escape() {
        let result = convert_xml_pattern(r"\p{L}\P{N}", ConvertOptions::xpath());
        assert_eq!(result, r"\p{L}\P{N}");
    }
    // With V1.0 options, `\p{Lu}` is expanded into an explicit class.
    #[test]
    fn test_v1_0_expands_p_category_escape() {
        let result = convert_xml_pattern(r"\p{Lu}*", ConvertOptions::xsd_v1_0());
        assert!(result.starts_with("^["));
        assert!(result.ends_with("]*$"));
        assert!(!result.contains("\\p{"));
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("A"));
        assert!(regex.is_match("ABC"));
        assert!(!regex.is_match("a"));
        // U+1D7A8 must not be accepted by the expanded class.
        let s = format!("A{}", char::from_u32(0x1D7A8).unwrap());
        assert!(!regex.is_match(&s));
    }
    // With V1.0 options, `\P{N}` is expanded into a negated class.
    #[test]
    fn test_v1_0_expands_negated_p_category_escape() {
        let result = convert_xml_pattern(r"\P{N}*", ConvertOptions::xsd_v1_0());
        assert!(result.contains("[^"));
        assert!(!result.contains("\\P{"));
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("abc"));
        assert!(!regex.is_match("123"));
    }
    // A block escape without an expansion is left as written.
    #[test]
    fn test_v1_0_passes_through_block_escape() {
        let result = convert_xml_pattern(r"\p{IsBasicLatin}*", ConvertOptions::xsd_v1_0());
        assert!(result.contains(r"\p{IsBasicLatin}"));
    }
    // An unknown category name is left as written.
    #[test]
    fn test_v1_0_passes_through_unknown_category() {
        let result = convert_xml_pattern(r"\p{Xx}", ConvertOptions::xsd_v1_0());
        assert!(result.contains(r"\p{Xx}"));
    }
    // Name escapes, literals and standard escapes combine in one pattern.
    #[test]
    fn test_mixed_pattern() {
        let result = convert_xml_pattern(r"\i\c*:\d+", ConvertOptions::xsd());
        assert_eq!(result, r"^[A-Za-z_:][A-Za-z0-9._:\-]*:\d+$");
        let regex = Regex::new(&result).unwrap();
        assert!(regex.is_match("item:123"));
        assert!(!regex.is_match("123:abc"));
    }
    // An empty pattern yields only the anchors, or nothing when unanchored.
    #[test]
    fn test_empty_pattern() {
        let result = convert_xml_pattern("", ConvertOptions::xsd());
        assert_eq!(result, "^$");
        let result = convert_xml_pattern("", ConvertOptions::xpath());
        assert_eq!(result, "");
    }
    // A trailing backslash is copied through.
    #[test]
    fn test_trailing_backslash() {
        let result = convert_xml_pattern(r"abc\", ConvertOptions::xpath());
        assert_eq!(result, r"abc\");
    }
    // The rewrite expands `\p{Lu}` but leaves `\i` and `\c` alone.
    #[test]
    fn test_rewrite_xsd10_expands_p_but_keeps_name_escapes() {
        let result = rewrite_xsd10_category_escapes(r"\i\c*\p{Lu}+");
        assert!(result.starts_with(r"\i\c*["), "unexpected: {}", result);
        assert!(result.ends_with("]+"), "unexpected: {}", result);
        assert!(!result.contains(r"\p{"));
    }
    // The rewrite leaves a block escape without an expansion untouched.
    #[test]
    fn test_rewrite_xsd10_passes_block_escapes() {
        let result = rewrite_xsd10_category_escapes(r"\p{IsBasicLatin}+");
        assert_eq!(result, r"\p{IsBasicLatin}+");
    }
    // The rewrite leaves an unknown category name untouched.
    #[test]
    fn test_rewrite_xsd10_passes_unknown_names() {
        let result = rewrite_xsd10_category_escapes(r"\p{Xx}");
        assert_eq!(result, r"\p{Xx}");
    }
    // The rewrite turns `\P{N}` into a negated class.
    #[test]
    fn test_rewrite_xsd10_negated_category() {
        let result = rewrite_xsd10_category_escapes(r"\P{N}+");
        assert!(result.starts_with("[^"));
        assert!(result.ends_with("]+"));
    }
    // Hyphen placement inside character classes: accepted and rejected forms.
    #[test]
    fn test_validate_xsd10_character_class_hyphen_rules() {
        for valid in [
            r"[a-d]",
            r"[-a]+",
            r"[-]",
            r"[a-]",
            r"[a-\}-]+",
            r"[a-z--[b-z]]",
            r"[a-b-[0-9]]+",
        ] {
            assert!(
                validate_xml_pattern_syntax(valid).is_ok(),
                "expected valid XSD 1.0 regex: {valid}",
            );
        }
        for invalid in [
            r"[^a-d-b-c]",
            r"[a-c-1-4x-z-7-9]*",
            r"[a-a-x-x]+",
            r"[a-z-+]*",
            r"[a--b]",
            r"[--z]",
        ] {
            assert!(
                validate_xml_pattern_syntax(invalid).is_err(),
                "expected invalid XSD 1.0 regex: {invalid}",
            );
        }
    }
    // `(?#…)` comments are removed wherever they appear.
    #[test]
    fn lenient_ms_strips_inline_comments() {
        assert_eq!(lenient_ms_preprocess("a(?#note)b"), "ab");
        assert_eq!(lenient_ms_preprocess("(?#start)abc(?#end)"), "abc");
    }
    // A pattern without comments is returned borrowed.
    #[test]
    fn lenient_ms_passthrough_when_clean() {
        let p = "^abc[0-9]+$";
        let result = lenient_ms_preprocess(p);
        assert!(matches!(result, std::borrow::Cow::Borrowed(_)));
        assert_eq!(result, p);
    }
    // Anchors and negated classes are left for the regex engine.
    #[test]
    fn lenient_ms_keeps_anchors_for_engine() {
        assert_eq!(lenient_ms_preprocess("^abc$"), "^abc$");
        assert_eq!(lenient_ms_preprocess("[^abc]"), "[^abc]");
    }
}