easy_regex/
collection.rs

//! A handful of universal regular expressions.
//!
//! This is a collection of the most commonly used regular expressions, meant to help avoid rudimentary mistakes and make code more readable.
//! Besides English, there are patterns for five other languages: Persian, French, German, Arabic and Chinese.

/// Range of ASCII letters (both cases) and digits.
///
/// Should be used inside the **list** method for its full capability.
pub const ALPHA_NUMERIC: &str = "a-zA-Z0-9";
/// Range of ASCII letters in both cases.
///
/// Should be used inside the **list** method for its full capability.
pub const UPPER_LOWER_CASE: &str = "a-zA-Z";
/// Range of lowercase ASCII letters.
///
/// Should be used inside the **list** method for its full capability.
pub const LOWER_CASE: &str = "a-z";
/// Range of uppercase ASCII letters.
///
/// Should be used inside the **list** method for its full capability.
pub const UPPER_CASE: &str = "A-Z";
/// Range of ASCII digits.
///
/// Should be used inside the **list** method for its full capability.
pub const DIGITS: &str = "0-9";
/// The regex `.` metacharacter.
pub const ANY: &str = ".";
/// A literal NUL character (U+0000).
pub const NULL_CHAR: &str = "\0";
/// A literal line-feed character (U+000A).
pub const NEW_LINE: &str = "\n";
/// The `\f` (form feed) regex escape.
pub const FORM_FEED: &str = "\\f";
/// A literal horizontal-tab character (U+0009).
pub const TAB: &str = "\t";
/// The `\v` (vertical tab) regex escape.
pub const VERTICAL_TAB: &str = "\\v";
/// The backspace character (U+0008), written as the hex escape `\x08`.
///
/// The JavaScript idiom `[\b]` is deliberately not used: in the `regex` crate
/// `\b` is always a word-boundary assertion and is rejected inside a character
/// class, so that spelling fails to compile. `\x08` works both on its own and
/// inside the **list** method.
pub const BACKSPACE: &str = r"\x08";
/// Retrieves two capturing groups, one for the **username**, the other for the **mail server and its domain**.
///
/// The username must contain at least one character, so a bare `@name.tld`
/// handle is not reported as an e-mail address. Only lowercase letters are
/// accepted by the character classes.
///
/// # Examples
///
/// ```
/// use easy_regex::{collection::*, EasyRegex};
/// let text = r#"something@email.co.uk"#;
///
/// let result = EasyRegex::new(EMAIL);
/// let captures = result.get_regex().unwrap();
/// captures.captures_iter(text).for_each(|caps| {
///     println!("{}, {}", &caps.get(1).unwrap().as_str(), &caps.get(2).unwrap().as_str());
/// });
/// // will print: something, email.co.uk
/// ```
pub const EMAIL: &str = r"([a-z0-9_+.]+)@([a-z0-9]+(?:[\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6})";
/// Retrieves **protocol, subdomain, domain name, top level name, directory** and **query params** of a URL on multiple lines.
///
/// The pieces are exposed both by position (groups 1 to 6, in the order listed
/// above) and by name (`protocol`, `subdomain`, `domain_name`,
/// `top_level_name`, `directory`, `query_params`). Every group except the
/// domain name and the top level name is optional.
///
/// Directory segments accept ASCII letters of either case, digits, `-` and
/// `_`, so paths such as `/My-Page/sub_dir` are captured in full.
///
/// # Examples
///
/// ```
/// use easy_regex::{collection::*, EasyRegex};
/// let text = r#"http://www.swimming-pool.co.uk/products/shorts?searchMe=queryMe&name=smith
/// something@gmail.com
/// www.seasoning.com
/// university.gov helloworld.com
/// https://javaScript.com
/// "#;
///
/// let result = EasyRegex::new(WEBSITE_URL);
/// let captures = result.get_regex().unwrap();
/// captures.captures_iter(text).for_each(|caps| {
///     println!(
///         "protocol: {}, subdomain: {}, domain name: {}, top level name: {}, directory: {}, query params: {}\n",
///         // "protocol",
///         &caps.get(1).map_or("not found", |m| m.as_str()),
///         // "subdomain",
///         &caps.get(2).map_or("not found", |m| m.as_str()),
///         // "domain_name",
///         &caps.get(3).map_or("not found", |m| m.as_str()),
///         // "top_level_name",
///         &caps.get(4).map_or("not found", |m| m.as_str()),
///         // "directory",
///         &caps.get(5).map_or("not found", |m| m.as_str()),
///         // "query_params"
///         &caps.get(6).map_or("not found", |m| m.as_str()),
///     );
/// });
/// // will print:
/// // protocol: http, subdomain: www, domain name: swimming-pool,
/// // top level name: .co.uk, directory: /products/shorts,
/// // query params: searchMe=queryMe&name=smith
/// //
/// // protocol: not found, subdomain: www, domain name: seasoning,
/// // top level name: .com, directory: not found, query params: not found
/// //
/// // protocol: not found, subdomain: not found, domain name: university,
/// // top level name: .gov, directory: not found, query params: not found
/// //
/// // protocol: https, subdomain: not found, domain name: javaScript,
/// // top level name: .com, directory: not found, query params: not found
/// ```
pub const WEBSITE_URL: &str = concat!(
    // Multi-line flag, kept from the original pattern.
    r"(?m)",
    // Optional protocol and optional `www` subdomain.
    r"(?:(?:(?P<protocol>ftp|https?)://)?(?:(?P<subdomain>www)\.)?)?",
    r"(?P<domain_name>[-a-zA-Z0-9]{2,253})",
    // One or more dotted lowercase suffixes, e.g. `.co.uk`.
    r"(?P<top_level_name>(?:\.[a-z]{2,6})+)",
    // Path segments; mixed case, `-` and `_` are allowed so real-world paths
    // are not cut short at the first such character.
    r"(?P<directory>(?:/[-a-zA-Z0-9_]+)+)?",
    // Everything after `?`, without the question mark itself.
    r"(?:\?(?P<query_params>[-a-zA-Z0-9@:%_\+~#()&/=]*))?",
);
/// Persian letters and diacritics, followed by the whole `\p{arabic}` script class.
///
/// The script class is appended directly, without a `|`: this constant is
/// meant to sit inside a character class, where `|` is a literal pipe rather
/// than alternation and would make the class match `|` itself.
///
/// Should be used inside the **list** method for its full capability.
pub const PERSIAN_ALPHABET: &str = r"\u0621-\u0628\u062A-\u063A\u0641-\u0642\u0644-\u0648\u064E-\u0651\u0655\u067E\u0686\u0698\u06A9\u06AF\u06BE\u06CC\p{arabic}";
/// Extended Arabic-Indic digits (U+06F0 to U+06F9) and Arabic-Indic digits (U+0660 to U+0669).
///
/// Should be used inside the **list** method for its full capability.
pub const PERSIAN_ARABIC_NUM: &str = r"\u06F0-\u06F9\u0660-\u0669";
/// The explicit Persian letter ranges together with both digit ranges.
///
/// Should be used inside the **list** method for its full capability.
pub const PERSIAN_ALPHA_NUMERIC: &str = r"\u0621-\u0628\u062A-\u063A\u0641-\u0642\u0644-\u0648\u064E-\u0651\u0655\u067E\u0686\u0698\u06A9\u06AF\u06BE\u06CC\u06F0-\u06F9\u0660-\u0669";
/// Punctuation code points U+060C, U+061B, U+061F, U+0640, U+066A, U+066B and U+066C.
pub const PERSIAN_PUNCTUATION: &str = r"\u060C\u061B\u061F\u0640\u066A\u066B\u066C";
/// The ASCII space plus the ranges U+2000 to U+200F and U+2028 to U+202F.
///
/// Should be used inside the **list** method for its full capability.
pub const PERSIAN_SPACES: &str = r"\u0020\u2000-\u200F\u2028-\u202F";
/// ASCII letters plus the range U+00C0 to U+017F (accented Latin letters).
///
/// Should be used inside the **list** method for its full capability.
pub const FRENCH_ALPHABET: &str = r"a-zA-Z\u00C0-\u017F";
/// ASCII letters plus the umlauts (both cases) and the sharp s (U+00DF).
///
/// Should be used inside the **list** method for its full capability.
pub const GERMAN_ALPHABET: &str = r"a-zA-Z\u00E4\u00F6\u00FC\u00C4\u00D6\u00DC\u00df";
/// The range U+4E00 to U+9FA5.
///
/// Should be used inside the **list** method for its full capability.
pub const CHINESE_ALPHABET: &str = r"\u4e00-\u9fa5";
/// Captures hour, minute and an optional case-insensitive am/pm marker in 12-hour clock.
///
/// Group 1 is the hour, group 2 the minute and group 3 the am/pm marker
/// (absent when the time carries none).
///
/// # Examples
/// ```
/// let text = r#"
/// 2:50 6:52 06:30 3:8
/// 7:43 18:59 4:50Pm 5:20 am
/// "#;
/// // By using this regex, the output will be:
/// // Some(Captures({
/// //    0: Some("2:50"),
/// //    1: Some("2"),
/// //    2: Some("50"),
/// //    3: None
/// // })),
/// // Some(Captures({
/// //    0: Some("6:52"),
/// //    1: Some("6"),
/// //    2: Some("52"),
/// //    3: None
/// // })),
/// // Some(Captures({
/// //    0: Some("06:30"),
/// //    1: Some("06"),
/// //    2: Some("30"),
/// //    3: None
/// // })),
/// // Some(Captures({
/// //    0: Some("3:8"),
/// //    1: Some("3"),
/// //    2: Some("8"),
/// //    3: None
/// // })),
/// // Some(Captures({
/// //    0: Some("7:43"),
/// //    1: Some("7"),
/// //    2: Some("43"),
/// //    3: None
/// // })),
/// // Some(Captures({
/// //    0: Some("4:50"),
/// //    1: Some("4"),
/// //    2: Some("50"),
/// //    3: Some("Pm"),
/// // })),
/// // Some(Captures({
/// //    0: Some("5:20"),
/// //    1: Some("5"),
/// //    2: Some("20"),
/// //    3: Some("am"),
/// // })),
/// ```
pub const TIME_HH_MM_12_AMPM: &str = concat!(
    r"\b(1[0-2]|0?[1-9])",   // hour: 1-12, optional leading zero
    r":([0-5]?\d)",          // minute: one or two digits
    r"(?: ?((?i)[ap]m))?\b", // optional am/pm, optionally preceded by one space
);
/// Same as `TIME_HH_MM_12_AMPM`, with an extra group for seconds:
/// hour, minute, seconds and the optional case-insensitive am/pm marker.
pub const TIME_HH_MM_SS_12_AMPM: &str = concat!(
    r"\b(1[0-2]|0?[1-9])",   // hour
    r":([0-5]?\d)",          // minute
    r":([0-5]?\d)",          // seconds
    r"(?: ?((?i)[ap]m))?\b", // optional am/pm
);
/// Captures hour (group 1) and minute (group 2) in 24-hour clock.
pub const TIME_HH_MM_24: &str = concat!(
    r"\b([01]?\d|2[0-3])", // hour: 0-23
    r":([0-5]?\d)\b",      // minute
);
/// Same as `TIME_HH_MM_24`, capturing seconds as a third group.
pub const TIME_HH_MM_SS_24: &str = concat!(
    r"\b([01]?\d|2[0-3])", // hour: 0-23
    r":([0-5]?\d)",        // minute
    r":([0-5]?\d)\b",      // seconds
);
/// Complete and abbreviated forms of Gregorian months (case sensitive).
///
/// The value is a bare alternation; wrap it in a group before embedding it in
/// a larger pattern.
pub const MONTH_NAMES: &str = r"January|Jan\.?|February|Feb\.?|March|Mar\.?|April|Apr\.?|May|June|Jun\.?|July|Jul\.?|August|Aug\.?|September|Sep\.?|Sept|October|Oct\.?|November|Nov\.?|December|Dec\.?";
// Day of month 1-31, optionally padded with a space or a zero.
const _DAY: &str = r"([ 0-2]?[1-9]|[1-2]0|3[01])";
// Month number 1-12, optionally padded with a space or a zero.
const _MONTH: &str = r"([ 0]?[1-9]|10|11|12)";
// Four-digit year that does not start with zero.
const _YEAR: &str = r"([1-9]\d{3})";

/// Builds the date pattern out of `_DAY`, `_MONTH`, `_YEAR` and `MONTH_NAMES`.
///
/// The result is twelve `\b…\b` alternatives joined by `|`:
/// month/day/year, day/month/year and year/month/day, each with `/`, with no
/// separator and with `-`, followed by the three orderings that use a
/// spelled-out month (captured as its own group) with an optional `,`, space
/// or `, ` between the parts.
fn _date_regex_generator() -> String {
    // Optional separator tolerated on either side of a spelled-out month.
    const NAME_SEPARATOR: &str = "(?:,| |, )?";

    let month_name = format!("({})", MONTH_NAMES);
    let numeric_orders = [
        [_MONTH, _DAY, _YEAR],
        [_DAY, _MONTH, _YEAR],
        [_YEAR, _MONTH, _DAY],
    ];
    let named_orders = [
        [month_name.as_str(), _DAY, _YEAR],
        [_DAY, month_name.as_str(), _YEAR],
        [_YEAR, month_name.as_str(), _DAY],
    ];

    let mut alternatives: Vec<String> = Vec::with_capacity(12);
    for separator in ["/", "", "-"].iter() {
        for [first, second, third] in numeric_orders.iter() {
            alternatives.push(format!(
                r"\b{}{}{}{}{}\b",
                first, separator, second, separator, third
            ));
        }
    }
    for [first, second, third] in named_orders.iter() {
        alternatives.push(format!(
            r"\b{}{}{}{}{}\b",
            first, NAME_SEPARATOR, second, NAME_SEPARATOR, third
        ));
    }
    alternatives.join("|")
}
/// Captures day, month and year in all valid formats.
/// ```text
/// "10/25/2025", "25-10-2025", "Feb, 15 2020", etc.
/// ```
///
/// The pattern is an alternation of twelve layouts, each with its own three
/// capturing groups, so the indices of the groups that participate in a match
/// depend on which layout matched.
pub const DATE: &str = concat!(
    // Slash-separated: month/day/year, day/month/year, year/month/day.
    r"\b([ 0]?[1-9]|10|11|12)/([ 0-2]?[1-9]|[1-2]0|3[01])/([1-9]\d{3})\b",
    r"|\b([ 0-2]?[1-9]|[1-2]0|3[01])/([ 0]?[1-9]|10|11|12)/([1-9]\d{3})\b",
    r"|\b([1-9]\d{3})/([ 0]?[1-9]|10|11|12)/([ 0-2]?[1-9]|[1-2]0|3[01])\b",
    // No separator, same three orderings.
    r"|\b([ 0]?[1-9]|10|11|12)([ 0-2]?[1-9]|[1-2]0|3[01])([1-9]\d{3})\b",
    r"|\b([ 0-2]?[1-9]|[1-2]0|3[01])([ 0]?[1-9]|10|11|12)([1-9]\d{3})\b",
    r"|\b([1-9]\d{3})([ 0]?[1-9]|10|11|12)([ 0-2]?[1-9]|[1-2]0|3[01])\b",
    // Hyphen-separated, same three orderings.
    r"|\b([ 0]?[1-9]|10|11|12)-([ 0-2]?[1-9]|[1-2]0|3[01])-([1-9]\d{3})\b",
    r"|\b([ 0-2]?[1-9]|[1-2]0|3[01])-([ 0]?[1-9]|10|11|12)-([1-9]\d{3})\b",
    r"|\b([1-9]\d{3})-([ 0]?[1-9]|10|11|12)-([ 0-2]?[1-9]|[1-2]0|3[01])\b",
    // Spelled-out month with an optional `,`, space or `, ` between the parts.
    // `Jun\.?` and `July` are separate alternatives; they used to be fused
    // into `Jun\.?July`, which kept "July" from matching on its own.
    r"|\b(January|Jan\.?|February|Feb\.?|March|Mar\.?|April|Apr\.?|May|June|Jun\.?|July|Jul\.?|August|Aug\.?|September|Sep\.?|Sept|October|Oct\.?|November|Nov\.?|December|Dec\.?)(?:,| |, )?([ 0-2]?[1-9]|[1-2]0|3[01])(?:,| |, )?([1-9]\d{3})\b",
    r"|\b([ 0-2]?[1-9]|[1-2]0|3[01])(?:,| |, )?(January|Jan\.?|February|Feb\.?|March|Mar\.?|April|Apr\.?|May|June|Jun\.?|July|Jul\.?|August|Aug\.?|September|Sep\.?|Sept|October|Oct\.?|November|Nov\.?|December|Dec\.?)(?:,| |, )?([1-9]\d{3})\b",
    r"|\b([1-9]\d{3})(?:,| |, )?(January|Jan\.?|February|Feb\.?|March|Mar\.?|April|Apr\.?|May|June|Jun\.?|July|Jul\.?|August|Aug\.?|September|Sep\.?|Sept|October|Oct\.?|November|Nov\.?|December|Dec\.?)(?:,| |, )?([ 0-2]?[1-9]|[1-2]0|3[01])\b",
);
/// Dotted-quad IPv4 address: four decimal octets in the range 0-255,
/// delimited by word boundaries. Contains no capturing groups.
pub const IPV4: &str = concat!(
    // Three octets, each followed by a dot.
    r"\b(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}",
    // Final octet.
    r"(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\b",
);
211lazy_static! {
212    pub static ref IPV6: &'static str = r"\b(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]))\b";
213    pub static ref IPV4_6: &'static str = r"\b(?:(?:(?:(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])))|(?:(?:(?:(?:[0-9A-Fa-f]{1,4}:){7}(?:[0-9A-Fa-f]{1,4}|:))|(?:(?:[0-9A-Fa-f]{1,4}:){6}(?::[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(?:(?:[0-9A-Fa-f]{1,4}:){5}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,2})|:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(?:(?:[0-9A-Fa-f]{1,4}:){4}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,3})|(?:(?::[0-9A-Fa-f]{1,4})?:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?:(?:[0-9A-Fa-f]{1,4}:){3}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,4})|(?:(?::[0-9A-Fa-f]{1,4}){0,2}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?:(?:[0-9A-Fa-f]{1,4}:){2}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,5})|(?:(?::[0-9A-Fa-f]{1,4}){0,3}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?:(?:[0-9A-Fa-f]{1,4}:){1}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,6})|(?:(?::[0-9A-Fa-f]{1,4}){0,4}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?::(?:(?:(?::[0-9A-Fa-f]{1,4}){1,7})|(?:(?::[0-9A-Fa-f]{1,4}){0,5}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(?:%.+)?))\b";
214}
215
#[cfg(test)]
mod tests {
    use crate::{
        collection::*,
        settings::{base::*, group::*},
        EasyRegex,
    };

    /// Runs of Persian letters/digits are found as separate words.
    #[test]
    fn persian_words_regex_works() {
        let text = "سلام شماره من ۱۲۳۶ است";
        let regex = EasyRegex::new_section()
            .list(&PERSIAN_ALPHA_NUMERIC, &ONE_OR_MORE)
            .get_regex()
            .unwrap();

        for found in regex.find_iter(text) {
            println!("{}", found.as_str());
        }

        assert_eq!(5, regex.find_iter(text).count());
    }

    /// Accented French names are matched as whole words.
    #[test]
    fn french_words_regex_works() {
        let text = "Adélaïde Aurélie Gaëlle";
        let regex = EasyRegex::new_section()
            .list(&FRENCH_ALPHABET, &ONE_OR_MORE)
            .get_regex()
            .unwrap();

        assert_eq!(3, regex.captures_iter(text).count());
    }

    /// German names containing umlauts are matched as whole words.
    #[test]
    fn german_words_regex_works() {
        let text = "Müller Sönke Käthe";
        let regex = EasyRegex::new_section()
            .list(&GERMAN_ALPHABET, &ONE_OR_MORE)
            .get_regex()
            .unwrap();

        assert_eq!(3, regex.captures_iter(text).count());
    }

    /// A run of Chinese characters matches the Chinese range.
    #[test]
    fn chinese_words_regex_works() {
        let text = "正则表达式";
        let regex = EasyRegex::new_section()
            .list(&CHINESE_ALPHABET, &ONE_OR_MORE)
            .get_regex()
            .unwrap();

        assert!(regex.is_match(text));
    }

    /// Prints the pieces extracted from each URL-like token (no assertions).
    #[test]
    fn website_url_works() {
        let text = r#"http://www.swimming-pool.co.uk/products/shorts?searchMe=queryMe&name=smith
            something@gmail.com
            www.seasoning.com
            university.gov helloworld.com
            https://javaScript.com
        "#;

        let regex = EasyRegex::new(WEBSITE_URL).get_regex().unwrap();
        for caps in regex.captures_iter(text) {
            // Text of group `index`, or a placeholder when it did not participate.
            let part = |index: usize| caps.get(index).map_or("not found", |m| m.as_str());
            println!(
                "protocol: {}, subdomain: {}, domain name: {}, top level name: {}, directory: {}, query params: {}\n",
                part(1), // "protocol"
                part(2), // "subdomain"
                part(3), // "domain_name"
                part(4), // "top_level_name"
                part(5), // "directory"
                part(6), // "query_params"
            );
        }
    }

    /// Every time in the sample is recognised by the 12-hour pattern.
    #[test]
    fn time_works() {
        let text = "7:4 5:20 6:30am 02:2 01:30";
        let regex = EasyRegex::new(TIME_HH_MM_12_AMPM).get_regex().unwrap();

        for captures in regex.captures_iter(text) {
            println!("{:?}", captures);
        }

        assert_eq!(5, regex.captures_iter(text).count());
    }

    /// A date followed by a single space and a 24-hour time is matched as one unit.
    #[test]
    fn date_and_time_works() {
        let text = r#"
            Feb 17 2009 5:3am 03/26/1994 8:41 23/7/2030 9:20Pm
            12 Sept 2015 6:14 03-26-1994 2:18 2030/4/27 3:50
        "#;
        let regex = EasyRegex::new_section()
            .group(DATE, &DEFAULT_GROUP) // will capture any valid format of a date.
            .literal_space()
            .group(TIME_HH_MM_24, &DEFAULT_GROUP) // will capture hours and minutes in 24-hour clock.
            .get_regex()
            .unwrap();

        for captures in regex.captures_iter(text) {
            println!("{}", captures.get(0).unwrap().as_str());
        }

        let matched_patterns_count = regex.captures_iter(text).count();
        assert_eq!(4, matched_patterns_count);
    }

    /// Both full-form IPv6 addresses in the sample are found.
    #[test]
    fn ip_works() {
        let text =
            "2001:0db8:85a3:0000:0000:8a2e:0370:7334 5002:0db8:85a3:0000:0000:8a2e:0560:7334";
        let regex = EasyRegex::new(&IPV6).get_regex().unwrap();

        for captures in regex.captures_iter(text) {
            println!("{:?}", captures);
        }

        assert_eq!(2, regex.captures_iter(text).count());
    }

    /// Prints the generated date pattern so it can be pasted into `DATE`.
    #[test]
    fn date_gen_output() {
        println!("{}", _date_regex_generator());
    }
}