llguidance/
regex_rewrite.rs

/// Looks up the body of the ASCII character class (the text that goes between
/// `[` and `]`) for a lowercase shorthand letter.
///
/// Only `d`, `w` and `s` are known; every other character yields `None`.
/// The whitespace class is spelled with backslash escape sequences (`\t`,
/// `\n`, ...) rather than with raw control characters.
fn class_for(c: char) -> Option<&'static str> {
    // (shorthand letter, class body) pairs.
    const ASCII_CLASSES: [(char, &str); 3] = [
        ('d', "0-9"),
        ('w', "0-9a-zA-Z_"),
        ('s', " \\t\\n\\r\\f\\v"),
    ];

    ASCII_CLASSES
        .iter()
        .find(|(letter, _)| *letter == c)
        .map(|&(_, body)| body)
}
9
10/// Make sure given regex can be used inside /.../ in Lark syntax.
11/// Also if `use_ascii.contains('d')` replace `\d` with `[0-9]` and `\D` with `[^0-9]`.
12/// Similarly for `\w`/`\W` (`[0-9a-zA-Z_]`) and `\s`/`\S` (`[ \t\n\r\f\v]`).
13/// For standard Unicode Python3 or Rust regex crate semantics `use_ascii = ""`
14/// For JavaScript or JSON Schema semantics `use_ascii = "dw"`
15/// For Python2 or byte patters in Python3 semantics `use_ascii = "dws"`
16/// More flags may be added in future.
17pub fn regex_to_lark(rx: &str, use_ascii: &str) -> String {
18    let mut is_q = false;
19    let mut res = String::new();
20    for c in rx.chars() {
21        let prev_q = is_q;
22        is_q = false;
23        match c {
24            // make sure we don't terminate on /
25            '/' => res.push_str("\\/"),
26
27            // these are optional, but nice
28            '\n' => res.push_str("\\n"),
29            '\r' => res.push_str("\\r"),
30            '\t' => res.push_str("\\t"),
31
32            '\\' if !prev_q => {
33                is_q = true;
34            }
35
36            'd' | 'w' | 's' | 'D' | 'W' | 'S' if prev_q => {
37                let c2 = c.to_ascii_lowercase();
38                if use_ascii.contains(c2) {
39                    let class = class_for(c2).unwrap();
40                    res.push('[');
41                    if c != c2 {
42                        res.push('^');
43                    }
44                    res.push_str(class);
45                    res.push(']');
46                } else {
47                    res.push('\\');
48                    res.push(c);
49                }
50            }
51
52            _ => {
53                if prev_q {
54                    res.push('\\');
55                }
56                res.push(c);
57            }
58        }
59    }
60    res
61}
62
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `regex_to_lark` with the given `use_ascii` flags over every
    /// `(input, expected)` pair and asserts that the output matches.
    fn check(use_ascii: &str, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(
                regex_to_lark(input, use_ascii),
                expected,
                "input {:?} with use_ascii {:?}",
                input,
                use_ascii
            );
        }
    }

    #[test]
    fn test_digit_conversion_with_ascii() {
        // The `d` flag expands the digit shorthand and its negation.
        check("d", &[(r"\d", "[0-9]"), (r"\D", "[^0-9]")]);
    }

    #[test]
    fn test_word_conversion_with_ascii() {
        // The `w` flag expands the word shorthand and its negation.
        check("w", &[(r"\w", "[0-9a-zA-Z_]"), (r"\W", "[^0-9a-zA-Z_]")]);
    }

    #[test]
    fn test_space_conversion_with_ascii() {
        // The `s` flag expands the whitespace shorthand and its negation.
        check(
            "s",
            &[
                (r"\s", "[ \\t\\n\\r\\f\\v]"),
                (r"\S", "[^ \\t\\n\\r\\f\\v]"),
            ],
        );
    }

    #[test]
    fn test_no_conversion_when_missing_in_use_ascii() {
        // A shorthand whose letter is absent from the flags stays as written.
        check("", &[(r"\d", r"\d")]);
        check("d", &[(r"\w", r"\w")]);
    }

    #[test]
    fn test_escaped_slashes_and_whitespace() {
        // `/` gains a backslash; raw newline, CR and tab become escape sequences.
        check("dws", &[("/a\nb\rc\td", r"\/a\nb\rc\td")]);
    }

    #[test]
    fn test_combined_conversions() {
        // All six shorthands in one pattern with every flag enabled.
        check(
            "dws",
            &[(
                r"\d\w\s\D\W\S",
                "[0-9][0-9a-zA-Z_][ \\t\\n\\r\\f\\v][^0-9][^0-9a-zA-Z_][^ \\t\\n\\r\\f\\v]",
            )],
        );
    }

    #[test]
    fn test_miscellaneous_escapes() {
        check(
            "",
            &[
                // Escapes with no special handling are passed through.
                (r"\X", r"\X"),
                (r"\@", r"\@"),
                // A forward slash always ends up escaped exactly once.
                (r"/", r"\/"),
                (r"\/", r"\/"),
                (r"\//", r"\/\/"),
                (r"/\//", r"\/\/\/"),
                // An escaped backslash is preserved.
                (r"\\", r"\\"),
                // Double quotes need no treatment.
                ("\"", "\""),
                (r#"a"b"#, r#"a"b"#),
            ],
        );
    }
}