1use crate::cluster::GraphemeCluster;
18use crate::component::Component;
19use crate::config::RegExpConfig;
20use crate::dfa::Dfa;
21use crate::expression::Expression;
22use itertools::Itertools;
23use regex::Regex;
24use std::cmp::Ordering;
25use std::fmt::{Display, Formatter, Result};
26
27pub struct RegExp<'a> {
28 ast: Expression<'a>,
29 config: &'a RegExpConfig,
30}
31
32impl<'a> RegExp<'a> {
33 pub(crate) fn from(test_cases: &'a mut Vec<String>, config: &'a RegExpConfig) -> Self {
34 if config.is_case_insensitive_matching {
35 Self::convert_for_case_insensitive_matching(test_cases);
36 }
37 Self::sort(test_cases);
38 let grapheme_clusters = Self::grapheme_clusters(test_cases, config);
39 let mut dfa = Dfa::from(&grapheme_clusters, true, config);
40 let mut ast = Expression::from(dfa, config);
41
42 if config.is_start_anchor_disabled && config.is_end_anchor_disabled {
43 let mut regex = Self::convert_expr_to_regex(&ast, config);
44
45 if config.is_verbose_mode_enabled {
46 regex = Regex::new(®ex.to_string().replace('\n', "")).unwrap();
48 }
49
50 if !Self::is_each_test_case_matched_after_rotating_alternations(
51 ®ex, &mut ast, test_cases,
52 ) {
53 dfa = Dfa::from(&grapheme_clusters, false, config);
54 ast = Expression::from(dfa, config);
55 regex = Self::convert_expr_to_regex(&ast, config);
56
57 if !Self::regex_matches_all_test_cases(®ex, test_cases) {
58 let mut exprs = vec![];
59 for cluster in grapheme_clusters {
60 let literal = Expression::new_literal(cluster, config);
61 exprs.push(literal);
62 }
63 ast = Expression::new_alternation(exprs, config);
64 }
65 }
66 }
67
68 Self { ast, config }
69 }
70
71 fn convert_for_case_insensitive_matching(test_cases: &mut Vec<String>) {
72 *test_cases = test_cases
76 .iter()
77 .map(|it| {
78 let lower_test_case = it.to_lowercase();
79 if lower_test_case.chars().count() == it.chars().count() {
80 lower_test_case
81 } else {
82 it.to_string()
83 }
84 })
85 .collect_vec();
86 }
87
88 fn convert_expr_to_regex(expr: &Expression, config: &RegExpConfig) -> Regex {
89 if config.is_output_colorized {
90 let color_replace_regex = Regex::new("\u{1b}\\[(?:\\d+;\\d+|0)m").unwrap();
91 Regex::new(&color_replace_regex.replace_all(&expr.to_string(), "")).unwrap()
92 } else {
93 Regex::new(&expr.to_string()).unwrap()
94 }
95 }
96
97 fn regex_matches_all_test_cases(regex: &Regex, test_cases: &[String]) -> bool {
98 test_cases
99 .iter()
100 .all(|test_case| regex.find_iter(test_case).count() == 1)
101 }
102
103 fn sort(test_cases: &mut Vec<String>) {
104 test_cases.sort();
105 test_cases.dedup();
106 test_cases.sort_by(|a, b| match a.len().cmp(&b.len()) {
107 Ordering::Equal => a.cmp(b),
108 other => other,
109 });
110 }
111
112 fn grapheme_clusters(
113 test_cases: &'a [String],
114 config: &'a RegExpConfig,
115 ) -> Vec<GraphemeCluster<'a>> {
116 let mut clusters = test_cases
117 .iter()
118 .map(|it| GraphemeCluster::from(it, config))
119 .collect_vec();
120
121 if config.is_char_class_feature_enabled() {
122 for cluster in clusters.iter_mut() {
123 cluster.convert_to_char_classes();
124 }
125 }
126
127 if config.is_repetition_converted {
128 for cluster in clusters.iter_mut() {
129 cluster.convert_repetitions();
130 }
131 }
132
133 clusters
134 }
135
136 fn is_each_test_case_matched_after_rotating_alternations(
137 regex: &Regex,
138 expr: &mut Expression,
139 test_cases: &[String],
140 ) -> bool {
141 for _ in 1..test_cases.len() {
142 if Self::regex_matches_all_test_cases(regex, test_cases) {
143 return true;
144 } else if let Expression::Alternation(options, _, _, _) = expr {
145 options.rotate_right(1);
146 } else if let Expression::Concatenation(first, second, _, _, _) = expr {
147 let a: &mut Expression = first;
148 let b: &mut Expression = second;
149
150 if let Expression::Alternation(options, _, _, _) = a {
151 options.rotate_right(1);
152 } else if let Expression::Alternation(options, _, _, _) = b {
153 options.rotate_right(1);
154 }
155 }
156 }
157 false
158 }
159}
160
161impl Display for RegExp<'_> {
162 fn fmt(&self, f: &mut Formatter<'_>) -> Result {
163 let flag =
164 if self.config.is_case_insensitive_matching && self.config.is_verbose_mode_enabled {
165 Component::IgnoreCaseAndVerboseModeFlag.to_repr(self.config.is_output_colorized)
166 } else if self.config.is_case_insensitive_matching {
167 Component::IgnoreCaseFlag.to_repr(self.config.is_output_colorized)
168 } else if self.config.is_verbose_mode_enabled {
169 Component::VerboseModeFlag.to_repr(self.config.is_output_colorized)
170 } else {
171 String::new()
172 };
173
174 let caret = if self.config.is_start_anchor_disabled {
175 String::new()
176 } else {
177 Component::Caret(self.config.is_verbose_mode_enabled)
178 .to_repr(self.config.is_output_colorized)
179 };
180
181 let dollar_sign = if self.config.is_end_anchor_disabled {
182 String::new()
183 } else {
184 Component::DollarSign(self.config.is_verbose_mode_enabled)
185 .to_repr(self.config.is_output_colorized)
186 };
187
188 let mut regexp = match self.ast {
189 Expression::Alternation(_, _, _, _) => {
190 format!(
191 "{}{}{}{}",
192 flag,
193 caret,
194 if self.config.is_capturing_group_enabled {
195 Component::CapturedParenthesizedExpression(
196 self.ast.to_string(),
197 self.config.is_verbose_mode_enabled,
198 false,
199 )
200 .to_repr(self.config.is_output_colorized)
201 } else {
202 Component::UncapturedParenthesizedExpression(
203 self.ast.to_string(),
204 self.config.is_verbose_mode_enabled,
205 false,
206 )
207 .to_repr(self.config.is_output_colorized)
208 },
209 dollar_sign
210 )
211 }
212 _ => {
213 format!("{}{}{}{}", flag, caret, self.ast, dollar_sign)
214 }
215 };
216
217 regexp = regexp
218 .replace('\u{b}', "\\v") .replace('\u{c}', "\\f"); if self.config.is_verbose_mode_enabled {
222 regexp = regexp
223 .replace('#', "\\#")
224 .replace(
225 [
226 ' ', ' ', ' ', ' ', ' ', ' ', ' ', '\u{85}', '\u{a0}', '\u{1680}',
227 '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}', '\u{2004}', '\u{2005}',
228 '\u{2006}', '\u{2007}', '\u{2008}', '\u{2009}', '\u{200a}', '\u{2028}',
229 '\u{2029}', '\u{202f}', '\u{205f}', '\u{3000}',
230 ],
231 "\\s",
232 )
233 .replace(' ', "\\ ");
234 }
235
236 write!(
237 f,
238 "{}",
239 if self.config.is_verbose_mode_enabled {
240 indent_regexp(regexp, self.config)
241 } else {
242 regexp
243 }
244 )
245 }
246}
247
248fn indent_regexp(regexp: String, config: &RegExpConfig) -> String {
249 let mut indented_regexp = vec![];
250 let mut nesting_level = 0;
251
252 for (i, line) in regexp.lines().enumerate() {
253 if i == 1 && config.is_start_anchor_disabled {
254 nesting_level += 1;
255 }
256 if line.is_empty() {
257 continue;
258 }
259
260 let is_colored_line = line.starts_with("\u{1b}[");
261
262 if nesting_level > 0
263 && ((is_colored_line && (line.contains('$') || line.contains(')')))
264 || (line == "$" || line.starts_with(')')))
265 {
266 nesting_level -= 1;
267 }
268
269 let indentation = " ".repeat(nesting_level);
270 indented_regexp.push(format!("{indentation}{line}"));
271
272 if (is_colored_line && (line.contains('^') || (i > 0 && line.contains('('))))
273 || (line == "^" || (i > 0 && line.starts_with('(')))
274 {
275 nesting_level += 1;
276 }
277 }
278
279 indented_regexp.join("\n")
280}