grex/
regexp.rs

1/*
2 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use crate::cluster::GraphemeCluster;
18use crate::component::Component;
19use crate::config::RegExpConfig;
20use crate::dfa::Dfa;
21use crate::expression::Expression;
22use itertools::Itertools;
23use regex::Regex;
24use std::cmp::Ordering;
25use std::fmt::{Display, Formatter, Result};
26
27pub struct RegExp<'a> {
28    ast: Expression<'a>,
29    config: &'a RegExpConfig,
30}
31
32impl<'a> RegExp<'a> {
33    pub(crate) fn from(test_cases: &'a mut Vec<String>, config: &'a RegExpConfig) -> Self {
34        if config.is_case_insensitive_matching {
35            Self::convert_for_case_insensitive_matching(test_cases);
36        }
37        Self::sort(test_cases);
38        let grapheme_clusters = Self::grapheme_clusters(test_cases, config);
39        let mut dfa = Dfa::from(&grapheme_clusters, true, config);
40        let mut ast = Expression::from(dfa, config);
41
42        if config.is_start_anchor_disabled && config.is_end_anchor_disabled {
43            let mut regex = Self::convert_expr_to_regex(&ast, config);
44
45            if config.is_verbose_mode_enabled {
46                // Remove line breaks before checking matches, otherwise check will be incorrect.
47                regex = Regex::new(&regex.to_string().replace('\n', "")).unwrap();
48            }
49
50            if !Self::is_each_test_case_matched_after_rotating_alternations(
51                &regex, &mut ast, test_cases,
52            ) {
53                dfa = Dfa::from(&grapheme_clusters, false, config);
54                ast = Expression::from(dfa, config);
55                regex = Self::convert_expr_to_regex(&ast, config);
56
57                if !Self::regex_matches_all_test_cases(&regex, test_cases) {
58                    let mut exprs = vec![];
59                    for cluster in grapheme_clusters {
60                        let literal = Expression::new_literal(cluster, config);
61                        exprs.push(literal);
62                    }
63                    ast = Expression::new_alternation(exprs, config);
64                }
65            }
66        }
67
68        Self { ast, config }
69    }
70
71    fn convert_for_case_insensitive_matching(test_cases: &mut Vec<String>) {
72        // Convert only those test cases to lowercase if
73        // they keep their original number of characters.
74        // Otherwise, "İ" -> "i\u{307}" would not match "İ".
75        *test_cases = test_cases
76            .iter()
77            .map(|it| {
78                let lower_test_case = it.to_lowercase();
79                if lower_test_case.chars().count() == it.chars().count() {
80                    lower_test_case
81                } else {
82                    it.to_string()
83                }
84            })
85            .collect_vec();
86    }
87
88    fn convert_expr_to_regex(expr: &Expression, config: &RegExpConfig) -> Regex {
89        if config.is_output_colorized {
90            let color_replace_regex = Regex::new("\u{1b}\\[(?:\\d+;\\d+|0)m").unwrap();
91            Regex::new(&color_replace_regex.replace_all(&expr.to_string(), "")).unwrap()
92        } else {
93            Regex::new(&expr.to_string()).unwrap()
94        }
95    }
96
97    fn regex_matches_all_test_cases(regex: &Regex, test_cases: &[String]) -> bool {
98        test_cases
99            .iter()
100            .all(|test_case| regex.find_iter(test_case).count() == 1)
101    }
102
103    fn sort(test_cases: &mut Vec<String>) {
104        test_cases.sort();
105        test_cases.dedup();
106        test_cases.sort_by(|a, b| match a.len().cmp(&b.len()) {
107            Ordering::Equal => a.cmp(b),
108            other => other,
109        });
110    }
111
112    fn grapheme_clusters(
113        test_cases: &'a [String],
114        config: &'a RegExpConfig,
115    ) -> Vec<GraphemeCluster<'a>> {
116        let mut clusters = test_cases
117            .iter()
118            .map(|it| GraphemeCluster::from(it, config))
119            .collect_vec();
120
121        if config.is_char_class_feature_enabled() {
122            for cluster in clusters.iter_mut() {
123                cluster.convert_to_char_classes();
124            }
125        }
126
127        if config.is_repetition_converted {
128            for cluster in clusters.iter_mut() {
129                cluster.convert_repetitions();
130            }
131        }
132
133        clusters
134    }
135
136    fn is_each_test_case_matched_after_rotating_alternations(
137        regex: &Regex,
138        expr: &mut Expression,
139        test_cases: &[String],
140    ) -> bool {
141        for _ in 1..test_cases.len() {
142            if Self::regex_matches_all_test_cases(regex, test_cases) {
143                return true;
144            } else if let Expression::Alternation(options, _, _, _) = expr {
145                options.rotate_right(1);
146            } else if let Expression::Concatenation(first, second, _, _, _) = expr {
147                let a: &mut Expression = first;
148                let b: &mut Expression = second;
149
150                if let Expression::Alternation(options, _, _, _) = a {
151                    options.rotate_right(1);
152                } else if let Expression::Alternation(options, _, _, _) = b {
153                    options.rotate_right(1);
154                }
155            }
156        }
157        false
158    }
159}
160
161impl Display for RegExp<'_> {
162    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
163        let flag =
164            if self.config.is_case_insensitive_matching && self.config.is_verbose_mode_enabled {
165                Component::IgnoreCaseAndVerboseModeFlag.to_repr(self.config.is_output_colorized)
166            } else if self.config.is_case_insensitive_matching {
167                Component::IgnoreCaseFlag.to_repr(self.config.is_output_colorized)
168            } else if self.config.is_verbose_mode_enabled {
169                Component::VerboseModeFlag.to_repr(self.config.is_output_colorized)
170            } else {
171                String::new()
172            };
173
174        let caret = if self.config.is_start_anchor_disabled {
175            String::new()
176        } else {
177            Component::Caret(self.config.is_verbose_mode_enabled)
178                .to_repr(self.config.is_output_colorized)
179        };
180
181        let dollar_sign = if self.config.is_end_anchor_disabled {
182            String::new()
183        } else {
184            Component::DollarSign(self.config.is_verbose_mode_enabled)
185                .to_repr(self.config.is_output_colorized)
186        };
187
188        let mut regexp = match self.ast {
189            Expression::Alternation(_, _, _, _) => {
190                format!(
191                    "{}{}{}{}",
192                    flag,
193                    caret,
194                    if self.config.is_capturing_group_enabled {
195                        Component::CapturedParenthesizedExpression(
196                            self.ast.to_string(),
197                            self.config.is_verbose_mode_enabled,
198                            false,
199                        )
200                        .to_repr(self.config.is_output_colorized)
201                    } else {
202                        Component::UncapturedParenthesizedExpression(
203                            self.ast.to_string(),
204                            self.config.is_verbose_mode_enabled,
205                            false,
206                        )
207                        .to_repr(self.config.is_output_colorized)
208                    },
209                    dollar_sign
210                )
211            }
212            _ => {
213                format!("{}{}{}{}", flag, caret, self.ast, dollar_sign)
214            }
215        };
216
217        regexp = regexp
218            .replace('\u{b}', "\\v") // U+000B Line Tabulation
219            .replace('\u{c}', "\\f"); // U+000C Form Feed
220
221        if self.config.is_verbose_mode_enabled {
222            regexp = regexp
223                .replace('#', "\\#")
224                .replace(
225                    [
226                        ' ', ' ', ' ', ' ', ' ', ' ', ' ', '\u{85}', '\u{a0}', '\u{1680}',
227                        '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}', '\u{2004}', '\u{2005}',
228                        '\u{2006}', '\u{2007}', '\u{2008}', '\u{2009}', '\u{200a}', '\u{2028}',
229                        '\u{2029}', '\u{202f}', '\u{205f}', '\u{3000}',
230                    ],
231                    "\\s",
232                )
233                .replace(' ', "\\ ");
234        }
235
236        write!(
237            f,
238            "{}",
239            if self.config.is_verbose_mode_enabled {
240                indent_regexp(regexp, self.config)
241            } else {
242                regexp
243            }
244        )
245    }
246}
247
248fn indent_regexp(regexp: String, config: &RegExpConfig) -> String {
249    let mut indented_regexp = vec![];
250    let mut nesting_level = 0;
251
252    for (i, line) in regexp.lines().enumerate() {
253        if i == 1 && config.is_start_anchor_disabled {
254            nesting_level += 1;
255        }
256        if line.is_empty() {
257            continue;
258        }
259
260        let is_colored_line = line.starts_with("\u{1b}[");
261
262        if nesting_level > 0
263            && ((is_colored_line && (line.contains('$') || line.contains(')')))
264                || (line == "$" || line.starts_with(')')))
265        {
266            nesting_level -= 1;
267        }
268
269        let indentation = "  ".repeat(nesting_level);
270        indented_regexp.push(format!("{indentation}{line}"));
271
272        if (is_colored_line && (line.contains('^') || (i > 0 && line.contains('('))))
273            || (line == "^" || (i > 0 && line.starts_with('(')))
274        {
275            nesting_level += 1;
276        }
277    }
278
279    indented_regexp.join("\n")
280}