grex/
builder.rs

1/*
2 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use crate::config::RegExpConfig;
18use crate::regexp::RegExp;
19use itertools::Itertools;
20use std::io::ErrorKind;
21use std::path::PathBuf;
22
/// Panic message used by `RegExpBuilder::from` when it is called with an empty
/// slice of test cases.
pub(crate) const MISSING_TEST_CASES_MESSAGE: &str =
    "No test cases have been provided for regular expression generation";
25
/// Panic message used by `RegExpBuilder::with_minimum_repetitions` when the
/// requested quantity is zero.
pub(crate) const MINIMUM_REPETITIONS_MESSAGE: &str =
    "Quantity of minimum repetitions must be greater than zero";
28
/// Panic message used by `RegExpBuilder::with_minimum_substring_length` when the
/// requested length is zero.
pub(crate) const MINIMUM_SUBSTRING_LENGTH_MESSAGE: &str =
    "Minimum substring length must be greater than zero";
31
/// This struct builds regular expressions from user-provided test cases.
///
/// A builder is created with [`RegExpBuilder::from`] or [`RegExpBuilder::from_file`],
/// optionally adjusted through its `with_*` / `without_*` methods, and finally
/// turned into a regular expression string by [`RegExpBuilder::build`].
#[derive(Clone)]
#[cfg_attr(feature = "python", pyo3::prelude::pyclass)]
pub struct RegExpBuilder {
    /// The test cases the regular expression is generated from.
    pub(crate) test_cases: Vec<String>,
    /// The settings collected by the builder methods; passed to `RegExp` on build.
    pub(crate) config: RegExpConfig,
}
39
40impl RegExpBuilder {
41    /// Specifies the test cases to build the regular expression from.
42    ///
43    /// The test cases need not be sorted because `RegExpBuilder` sorts them internally.
44    ///
45    /// ⚠ Panics if `test_cases` is empty.
46    pub fn from<T: Clone + Into<String>>(test_cases: &[T]) -> Self {
47        if test_cases.is_empty() {
48            panic!("{}", MISSING_TEST_CASES_MESSAGE);
49        }
50        Self {
51            test_cases: test_cases.iter().cloned().map(|it| it.into()).collect_vec(),
52            config: RegExpConfig::new(),
53        }
54    }
55
56    /// Specifies a text file containing test cases to build the regular expression from.
57    ///
58    /// The test cases need not be sorted because `RegExpBuilder` sorts them internally.
59    ///
60    /// Each test case needs to be on a separate line.
61    /// Lines may be ended with either a newline (`\n`) or
62    /// a carriage return with a line feed (`\r\n`).
63    /// The final line ending is optional.
64    ///
65    /// ⚠ Panics if:
66    /// - the file cannot be found
67    /// - the file's encoding is not valid UTF-8 data
68    /// - the file cannot be opened because of conflicting permissions
69    pub fn from_file<T: Into<PathBuf>>(file_path: T) -> Self {
70        match std::fs::read_to_string(file_path.into()) {
71            Ok(file_content) => Self {
72                test_cases: file_content.lines().map(|it| it.to_string()).collect_vec(),
73                config: RegExpConfig::new(),
74            },
75            Err(error) => match error.kind() {
76                ErrorKind::NotFound => panic!("The specified file could not be found"),
77                ErrorKind::InvalidData => {
78                    panic!("The specified file's encoding is not valid UTF-8")
79                }
80                ErrorKind::PermissionDenied => {
81                    panic!("Permission denied: The specified file could not be opened")
82                }
83                _ => panic!("{}", error),
84            },
85        }
86    }
87
88    /// Converts any Unicode decimal digit to character class `\d`.
89    ///
90    /// This method takes precedence over
91    /// [`with_conversion_of_words`](Self::with_conversion_of_words) if both are set.
92    /// Decimal digits are converted to `\d`, the remaining word characters to `\w`.
93    ///
94    /// This method takes precedence over
95    /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
96    /// Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`.
97    pub fn with_conversion_of_digits(&mut self) -> &mut Self {
98        self.config.is_digit_converted = true;
99        self
100    }
101
102    /// Converts any character which is not a Unicode decimal digit to character class `\D`.
103    ///
104    /// This method takes precedence over
105    /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set.
106    /// Non-digits which are also non-word characters are converted to `\D`.
107    ///
108    /// This method takes precedence over
109    /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
110    /// Non-digits which are also non-space characters are converted to `\D`.
111    pub fn with_conversion_of_non_digits(&mut self) -> &mut Self {
112        self.config.is_non_digit_converted = true;
113        self
114    }
115
116    /// Converts any Unicode whitespace character to character class `\s`.
117    ///
118    /// This method takes precedence over
119    /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set.
120    /// Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`.
121    ///
122    /// This method takes precedence over
123    /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set.
124    /// Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`.
125    pub fn with_conversion_of_whitespace(&mut self) -> &mut Self {
126        self.config.is_space_converted = true;
127        self
128    }
129
130    /// Converts any character which is not a Unicode whitespace character to character class `\S`.
131    pub fn with_conversion_of_non_whitespace(&mut self) -> &mut Self {
132        self.config.is_non_space_converted = true;
133        self
134    }
135
136    /// Converts any Unicode word character to character class `\w`.
137    ///
138    /// This method takes precedence over
139    /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set.
140    /// Word characters are converted to `\w`, the remaining non-digit characters to `\D`.
141    ///
142    /// This method takes precedence over
143    /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
144    /// Word characters are converted to `\w`, the remaining non-space characters to `\S`.
145    pub fn with_conversion_of_words(&mut self) -> &mut Self {
146        self.config.is_word_converted = true;
147        self
148    }
149
150    /// Converts any character which is not a Unicode word character to character class `\W`.
151    ///
152    /// This method takes precedence over
153    /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
154    /// Non-words which are also non-space characters are converted to `\W`.
155    pub fn with_conversion_of_non_words(&mut self) -> &mut Self {
156        self.config.is_non_word_converted = true;
157        self
158    }
159
160    /// Detects repeated non-overlapping substrings and
161    /// to convert them to `{min,max}` quantifier notation.
162    pub fn with_conversion_of_repetitions(&mut self) -> &mut Self {
163        self.config.is_repetition_converted = true;
164        self
165    }
166
167    /// Enables case-insensitive matching of test cases
168    /// so that letters match both upper and lower case.
169    pub fn with_case_insensitive_matching(&mut self) -> &mut Self {
170        self.config.is_case_insensitive_matching = true;
171        self
172    }
173
174    /// Replaces non-capturing groups with capturing ones.
175    pub fn with_capturing_groups(&mut self) -> &mut Self {
176        self.config.is_capturing_group_enabled = true;
177        self
178    }
179
180    /// Specifies the minimum quantity of substring repetitions to be converted if
181    /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set.
182    ///
183    /// If the quantity is not explicitly set with this method, a default value of 1 will be used.
184    ///
185    /// ⚠ Panics if `quantity` is zero.
186    pub fn with_minimum_repetitions(&mut self, quantity: u32) -> &mut Self {
187        if quantity == 0 {
188            panic!("{}", MINIMUM_REPETITIONS_MESSAGE);
189        }
190        self.config.minimum_repetitions = quantity;
191        self
192    }
193
194    /// Specifies the minimum length a repeated substring must have in order to be converted if
195    /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set.
196    ///
197    /// If the length is not explicitly set with this method, a default value of 1 will be used.
198    ///
199    /// ⚠ Panics if `length` is zero.
200    pub fn with_minimum_substring_length(&mut self, length: u32) -> &mut Self {
201        if length == 0 {
202            panic!("{}", MINIMUM_SUBSTRING_LENGTH_MESSAGE);
203        }
204        self.config.minimum_substring_length = length;
205        self
206    }
207
208    /// Converts non-ASCII characters to unicode escape sequences.
209    /// The parameter `use_surrogate_pairs` specifies whether to convert astral code planes
210    /// (range `U+010000` to `U+10FFFF`) to surrogate pairs.
211    pub fn with_escaping_of_non_ascii_chars(&mut self, use_surrogate_pairs: bool) -> &mut Self {
212        self.config.is_non_ascii_char_escaped = true;
213        self.config.is_astral_code_point_converted_to_surrogate = use_surrogate_pairs;
214        self
215    }
216
217    /// Produces a nicer looking regular expression in verbose mode.
218    pub fn with_verbose_mode(&mut self) -> &mut Self {
219        self.config.is_verbose_mode_enabled = true;
220        self
221    }
222
223    /// Removes the caret anchor '^' from the resulting regular
224    /// expression, thereby allowing to match the test cases also when they do not occur
225    /// at the start of a string.
226    pub fn without_start_anchor(&mut self) -> &mut Self {
227        self.config.is_start_anchor_disabled = true;
228        self
229    }
230
231    /// Removes the dollar sign anchor '$' from the resulting regular
232    /// expression, thereby allowing to match the test cases also when they do not occur
233    /// at the end of a string.
234    pub fn without_end_anchor(&mut self) -> &mut Self {
235        self.config.is_end_anchor_disabled = true;
236        self
237    }
238
239    /// Removes the caret and dollar sign anchors from the resulting
240    /// regular expression, thereby allowing to match the test cases also when they occur
241    /// within a larger string that contains other content as well.
242    pub fn without_anchors(&mut self) -> &mut Self {
243        self.config.is_start_anchor_disabled = true;
244        self.config.is_end_anchor_disabled = true;
245        self
246    }
247
248    /// Provides syntax highlighting for the resulting regular expression.
249    ///
250    /// ⚠ This method may only be used if the resulting regular expression is meant to
251    /// be printed to the console. The regex string representation returned from enabling
252    /// this setting cannot be fed into the [*regex*](https://crates.io/crates/regex) crate.
253    #[cfg(feature = "cli")]
254    #[doc(hidden)]
255    pub fn with_syntax_highlighting(&mut self) -> &mut Self {
256        self.config.is_output_colorized = true;
257        self
258    }
259
260    /// Builds the actual regular expression using the previously given settings.
261    pub fn build(&mut self) -> String {
262        RegExp::from(&mut self.test_cases, &self.config).to_string()
263    }
264}