regex_literal/lib.rs
1/*!
2regex-literal - regex literal enclosed by delimiters
3===============================================================================
4This crate provides a quick approach of creating regular expression [`Regex`]
5and sequence [`ReSequence`] from delimited literals at runtime. Its aim is to
6formalize regex literal in Rust computing.
7
8## Background
In the Rust Reference[^1], primitive types (boolean, numeric and textual) have
their own literal expressions that are evaluated as single tokens in source code
at compile time. This is not the case for regular expressions (abbr. regex).
12
13In many scripting languages that implement PCRE library[^2], a regex pattern
14is enclosed by a pair of delimiters, for example,`/pattern/im` in JavaScript.
15Regex engines in [Rust crate regex-automata](https://crates.io/crates/regex-automata),
16can only receive a general literal (&str) in building a one-pattern regex.
17In the interface of [Regex::new_many](https://docs.rs/regex-automata/latest/regex_automata/meta/struct.Regex.html#method.new_many),
18an array of many pattern strings is required, as there is no syntax
19for one string literal representing a compound regex.
20
21## Features
22The crate delivers literal formats for regex and regex sets with the
23following punctuations:
24
25* `//` (a pair of forward slashes) as the default delimiters that enclose a
26pattern.
27
28* `[]` (a pair of square brackets) that hold a union of multiple patterns
29(abbr. as 'ReU').
30
31* `<>` (a pair of angle brackets) that hold a sequence of regex patterns and/or
32 pattern unions (abbr. as 'ReS') that iterates over consecutive matchings.
33
* `,` (comma) serves as the separator between regex pattern literals, while any
whitespace unicode character[^3] is skipped in parsing.
36
37### Samples of regex literals
381. a simple pattern : `r#"/ab+c/"#`
392. a regex union literal: `r#"[/(?i)ab+c/,/(?u)\s{2}D+/]"#`
403. a regex sequence literal: `r#"</(?i)ab+c/,/(?u)\s{2}D+/>"#`
414. another regex sequence literal: `r#"<[/(?i)ab+c/,/(?u)\s{2}D+/],/\s*\w+/>"#`
42
43Note that [`crate::delimited::set_delimiter()`] allows choosing a customized
44delimiter from [`crate::delimited::DELIMITER_CHARS`](static@DELIMITER_CHARS).
In addition, the [`crate::util`] module provides public functions for text
conversion between undelimited and delimited patterns.
47
48## Building Regex structs from regex-literal
49The regular expression structs can be constructed via either
50[`crate::XRegex::try_from`],[`crate::XRegex::from_str`] or
51[`crate::XRegex::new`]. The former two use the default regex literal delimiter
52("/" transcoded in [`crate::delimited::DELIMITER`]); the latter allows a
53customised delimiter. An easy alternative is to use macro
54`xregex!` [`crate::xregex`] when constructing XRegex with literals.
55
56### Examples
57
58```rust
59use regex_literal::{XRegex,FromStr,Regex,Match,PatternID,Input,Anchored,xregex};
60
61//example 0: create a XRegex structs from a one-pattern literal by xregex!()
62let text = "abc123";
63//construct XRegex
64let xre = xregex!(r"/^[a-z]+\d{3}$/");
// equivalent to the following variants - (1) XRegex::try_from(br"/^[a-z]+\d{3}$/").unwrap() (2) XRegex::from_str(r"/^[a-z]+\d{3}$/").unwrap() (3) XRegex::new(r"/^[a-z]+\d{3}$/",b"/").unwrap()
66//get regex reference from XRegex struct
67let re = xre.as_regex().unwrap();
68//check if the one pattern regex matches with the target text
69assert!(re.is_match(text));
70
71//example 1: create a XRegex struct from a one-pattern literal
72let text0 = "abc123";
73//create one-pattern literal
74let re0 = r#"/^[a-z]+\d{3}$/"#;
75//construct XRegex
76let x = re0.parse::<XRegex>().unwrap();//let x = XRegex::from_str(re0).unwrap();
77//get Regex from XRegex struct
78let x_one_pattern = x.as_regex().unwrap();
79//check if the one pattern regex matches with the target text
80assert!(x_one_pattern.is_match(text0));
81//find the first match if it exists
82let m = x_one_pattern.find(text0);
83assert_eq!(m,Some(Match::must(0,0..6)));
84
85//example 2: create a XRegex struct from a one-pattern literal
86let text1 = "ABBBC abc123";
87let re1 = "!!!!(?i)ab+c!!!!";
88//construct XRegex
89let y = XRegex::new(re1,b"!!!!").unwrap();
90//get Regex from XRegex struct
91let y_one_pattern = y.as_regex().unwrap();
92// check if this one pattern regex matches with the input
93assert!(y_one_pattern.is_match(text1));
94//find all non-overlapping leftmost matches
95let matches:Vec<Match> = y_one_pattern.find_iter(text1).collect();
96assert_eq!(matches,vec![Match::must(0,0..5),Match::must(0,6..9),]);
97
98//example 3: create a XRegex struct from a multiple-pattern literal
99let reu = r"[/(?i)ab+c/,/\w+/]";
100let mut m1 = XRegex::from_str(reu).unwrap();
101//get Regex from XRegex struct
102let m_patterns = m1.get_regex().unwrap();
103assert!(m_patterns.is_match(text1));
104let m_matches:Vec<Match> = m_patterns.find_iter(text1).collect();
105assert_eq!(m_matches,vec![Match::must(0,0..5),Match::must(0,6..9),Match::must(1,9..12)]); //non-overlapping leftmost matches
106
107let expected = Some(Match::must(1,0..7));
108let input = Input::new("23ABBBC abc&").anchored(Anchored::Pattern(PatternID::must(1)));//choose the specific pattern for input
109let n_patterns = XRegex::from_str(reu).unwrap().get_regex().unwrap();
110let mut caps = n_patterns.create_captures();
111n_patterns.search_captures(&input,&mut caps);
112assert_eq!(expected, caps.get_match());
113
114//example 4: create a XRegex struct from a regex sequence literal
115let rel = br#"</(?i)ab+c/,/^\w+?\d+$/>"#;
116let xre2= XRegex::try_from(&rel[..]).unwrap();
117let seq_slice = xre2.as_slice().unwrap();
118let child_regex = &seq_slice[1];
119assert!(child_regex.is_match("abc333"));
120
121```
122
123## Conversion of regex literals
1241. [`crate::util::delimit`] and [`crate::util::undelimit`] provide regex
125literal conversion between undelimited and delimited forms.
126
127### Examples
128
129``` rust
130# use regex_literal::util::{delimit,undelimit};
131
132let delimiter = "/";
133// a regex literal that includes delimiter(forward slash `/`)
134let re1 = r"\d{2}/\d{2}/\d{4}";
135let delimited1 = delimit(re1,delimiter);
136let string1 = r"/\d{2}\/\d{2}\/\d{4}/";
137assert_eq!(&delimited1[..],string1);
138
139let undelimited = undelimit(&delimited1[..],delimiter).unwrap();
140assert_eq!(&undelimited[..], re1);
141
142```
143
1442. [`crate::assembly::into_reu`] and [`crate::assembly::into_res`] annotate
145patterns with default delimiters into delimited literals of regular expression
146union and sequence accordingly. Note the transformations require feature "w".
147
148### Examples
149
150``` rust
151# use regex_literal::assembly::into_reu;
152let re1 = "(?i)ab+c";
153let re2 = r"\w+";
154let re_set = [re1,re2];
155let reu = into_reu(&re_set);
156assert_eq!(reu,r"[/(?i)ab+c/,/\w+/]".to_owned());
157
158```
159
160
161## Acknowledgements
162[`regex-literal`] has adopted PCRE-style delimiters on top of regex engines in Rust crate regex-automata.
163
164[^1]: [literal expressions](https://doc.rust-lang.org/reference/expressions/literal-expr.html)
165
166[^2]: [PCRE flavor](https://pcre.org/original/doc/html/pcretest.html)
167
168[^3]: [Unicode characters with property White_Space=yes](https://en.wikipedia.org/wiki/Whitespace_character#Unicode)
169
170---
171
172
173
174*/
175
// include the changelog markdown file so that everything is documented in one place
177
178#![doc = include_str!("../CHANGELOG.md")]
179
180#![allow(unused)]
181extern crate alloc;
182mod error;
183pub mod util;
184
185pub mod delimited;
186pub use alloc::str::FromStr;//trait implemented in XRegex
187pub use delimited::*;
188
/// Builds an `XRegex` from literals, panicking when construction fails.
///
/// Two invocation forms are accepted:
///
/// * `xregex!(literal)` - `literal` is a regex string literal; it is handed to
///   the `FromStr` implementation of `XRegex`, i.e. the same thing as
///   `literal.parse::<XRegex>()`.
/// * `xregex!(literal, delimiter)` - `literal` is a regex string literal and
///   `delimiter` is the byte string literal of the delimiter; both are
///   forwarded unchanged to `XRegex::new`.
///
/// # Panics
///
/// Each form calls `unwrap()` on the construction result, so the macro
/// panics whenever the underlying constructor returns an error.
#[macro_export]
macro_rules! xregex {
    ($literal:literal) => {
        <$crate::XRegex as $crate::FromStr>::from_str($literal).unwrap()
    };
    ($literal:literal, $delimiter:literal) => {
        $crate::XRegex::new($literal, $delimiter).unwrap()
    };
}
199
200//https://doc.rust-lang.org/cargo/reference/features.html
201///feature `w` - "wiring"
202#[cfg(any(test, feature = "w"))]
203pub mod assembly;
204/// feature `c` - "correlation"
205#[cfg(any(test,feature = "c"))]
206pub mod correlation;
207
// The `#[cfg(test)]` attribute makes Rust compile and run this module only
// under `cargo test`, never for `cargo build`.
#[cfg(test)]
mod tests {
    use super::*;

    /// A single delimited pattern parses into an `XRegex` whose `literal`
    /// field keeps the delimiter and the source text, and whose regex matches
    /// case-insensitively.
    #[test]
    fn regex_from_single_pattern_literal() {
        let re_str = "/(?i)ab+c/";
        let mut re = XRegex::from_str(re_str).unwrap();
        // `literal.0` compares equal to the delimiter byte, `literal.1` to the
        // literal the regex was built from.
        assert_eq!(re.literal.0 as u8, b'/');
        assert_eq!(re.literal.1, "/(?i)ab+c/".to_owned());
        let my_re = re.get_regex().unwrap();
        assert!(my_re.is_match("ABBBC"));
    }

    /// A bracketed union literal yields one regex that matches when either
    /// member pattern matches.
    #[test]
    fn regex_from_pattern_union_str() {
        let reunion_str = "[/(?i)ab+c/,/(?u)\\s{2}D+/]";
        let mut re = XRegex::from_str(reunion_str).unwrap();
        let my_re_set = re.get_regex().unwrap();
        assert!(my_re_set.is_match("ABBBC"));
        // The second pattern requires two whitespace characters directly
        // before the `D`s, so the haystack starts with two spaces.
        assert!(my_re_set.is_match("  DD"));
    }

    /// An angle-bracketed sequence literal yields one regex per member, and
    /// each member can be used on its own for searching and capturing.
    #[test]
    fn resequence_from_literal() {
        let seq_str = r"</\s+/,/(\p{script=Han}+)(\d{3})/>";
        let mut xregex = XRegex::from_str(seq_str).unwrap();
        // Two leading spaces: the `\s+` match asserted below spans bytes 0..2.
        let input = "  天下一家1234";
        assert_eq!(xregex.data.matching_indices(input).unwrap(), vec![0, 1]);

        let seq = xregex.as_slice().unwrap();
        assert_eq!(seq.len(), 2);

        let regex_ws = &seq[0];
        let m1 = regex_ws.find(input);
        assert_eq!(m1, Some(Match::must(0, 0..2)));

        let regex_words = &seq[1];
        let mut caps = regex_words.create_captures();
        regex_words.captures(input, &mut caps);

        // Group 1 is the run of Han characters.
        let chinese_span = caps.get_group(1).unwrap();
        assert_eq!("天下一家", &input[chinese_span.start..chinese_span.end]);

        // Group 2 is `\d{3}`, i.e. only the first three of the four digits.
        let digits_span = caps.get_group(2).unwrap();
        assert_eq!("123", &input[digits_span.start..digits_span.end]);
    }

    /// The two-argument form of `xregex!` builds a usable regex.
    #[test]
    fn test_macro_xregex() {
        let mut x = xregex!("/(?i)ab+c/", b"/");
        let my_re = x.get_regex().unwrap();
        assert!(my_re.is_match("ABBBC"));
    }
}
276
277
// Tests for the `assembly` module (feature `w`). The module is declared with
// `cfg(any(test, feature = "w"))`, so a test build always contains it; outside
// of tests it has to be enabled with `--features "w"`.
// See https://doc.rust-lang.org/cargo/reference/features.html#command-line-feature-options
// and https://doc.rust-lang.org/cargo/commands/cargo-build.html
#[cfg(test)]
mod tests_on_w_feature {
    use super::*;
    use super::assembly::{assemble_res, assemble_reu, into_res, into_reu};

    // Already-delimited member patterns, as consumed by `assemble_*`.
    const DELIMITED_PATTERNS: [&str; 2] = ["/(?i)ab+c/", r"/\w+/"];
    // The same patterns without delimiters, as consumed by `into_*`.
    const BARE_PATTERNS: [&str; 2] = ["(?i)ab+c", r"\w+"];
    // Expected union and sequence literals for the patterns above.
    const UNION_LITERAL: &str = r"[/(?i)ab+c/,/\w+/]";
    const SEQUENCE_LITERAL: &str = r"</(?i)ab+c/,/\w+/>";

    /// Assembling delimited patterns gives the union literal, and parsing the
    /// assembled text gives the same `XRegex` as parsing the literal directly.
    #[test]
    fn test_assemble_reu() {
        let from_literal = XRegex::from_str(UNION_LITERAL).unwrap();

        let assembled = assemble_reu(&DELIMITED_PATTERNS).unwrap();
        assert_eq!(UNION_LITERAL.to_owned(), assembled);

        let from_assembled = XRegex::from_str(&assembled).unwrap();
        assert_eq!(from_literal, from_assembled);
    }

    /// Assembling delimited patterns gives the sequence literal, and parsing
    /// the assembled text gives the same `XRegex` as parsing the literal
    /// directly.
    #[test]
    fn test_assemble_res() {
        let from_literal = XRegex::from_str(SEQUENCE_LITERAL).unwrap();

        let assembled = assemble_res(&DELIMITED_PATTERNS).unwrap();
        assert_eq!(SEQUENCE_LITERAL.to_owned(), assembled);

        let from_assembled = XRegex::from_str(&assembled).unwrap();
        assert_eq!(from_literal, from_assembled);
    }

    /// Undelimited patterns are wrapped into a union literal.
    #[test]
    fn test_into_reu() {
        let union = into_reu(&BARE_PATTERNS);
        assert_eq!(union, UNION_LITERAL.to_owned());
    }

    /// Undelimited patterns are wrapped into a sequence literal.
    #[test]
    fn test_into_res() {
        let sequence = into_res(&BARE_PATTERNS);
        assert_eq!(sequence, SEQUENCE_LITERAL.to_owned());
    }
}
332
#[cfg(test)]
mod tests_on_c_feature {
    use super::*;
    use super::correlation::par_matching_indices;

    /// `par_matching_indices` reports indices 0 and 1 for a two-member
    /// sequence whose members both match the haystack.
    #[test]
    fn test_parallel_matching() {
        let haystack = " 天下一家1234";
        let sequence = XRegex::from_str(r"</\s+/,/(\p{script=Han}+)(\d{3})/>").unwrap();
        let members = sequence.as_slice().unwrap();
        let indices = par_matching_indices(members, haystack).unwrap();
        assert_eq!(indices, vec![0, 1]);
    }
}
348