regex_literal/
lib.rs

1/*!
2regex-literal - regex literal enclosed by delimiters
3===============================================================================
This crate provides a quick way to create a regular expression [`Regex`] 
and a regex sequence [`ReSequence`] from delimited literals at runtime. Its aim 
is to formalize regex literals in Rust computing.
7
8## Background
In the Rust Reference[^1], primitive types (boolean, numeric and textual) have 
their own literal expressions that are evaluated as single tokens in source code 
at compile time. This is not the case for regular expressions (abbr. regex).
12
In many scripting languages that implement the PCRE library[^2], a regex pattern 
is enclosed by a pair of delimiters, for example, `/pattern/im` in JavaScript. 
Regex engines in the [Rust crate regex-automata](https://crates.io/crates/regex-automata) 
can only receive a general literal (&str) when building a one-pattern regex. 
The interface of [Regex::new_many](https://docs.rs/regex-automata/latest/regex_automata/meta/struct.Regex.html#method.new_many) 
requires an array of pattern strings, as there is no syntax 
for one string literal representing a compound regex.
20
21## Features
22The crate delivers literal formats for regex and regex sets with the 
23following punctuations:
24
25* `//` (a pair of forward slashes) as the default delimiters that enclose a 
26pattern. 
27 
28* `[]` (a pair of square brackets) that hold a union of multiple patterns 
29(abbr. as 'ReU').
30
31* `<>` (a pair of angle brackets) that hold a sequence of regex patterns and/or
32 pattern unions (abbr. as 'ReS') that iterates over consecutive matchings.
33
* `,` (comma) serves as the separator between regex pattern literals, while any 
whitespace unicode character[^3] is skipped in parsing.
36
37### Samples of regex literals
381. a simple pattern : `r#"/ab+c/"#`
392. a regex union literal: `r#"[/(?i)ab+c/,/(?u)\s{2}D+/]"#`
403. a regex sequence literal:  `r#"</(?i)ab+c/,/(?u)\s{2}D+/>"#`
414. another regex sequence literal: `r#"<[/(?i)ab+c/,/(?u)\s{2}D+/],/\s*\w+/>"#`
42
Note that [`crate::delimited::set_delimiter()`] allows choosing a customized 
delimiter from [`crate::delimited::DELIMITER_CHARS`](static@DELIMITER_CHARS).
In addition, the [`crate::util`] module provides public functions for text 
conversion between undelimited and delimited patterns.
47
48## Building Regex structs from regex-literal
The regular expression structs can be constructed via any of 
[`crate::XRegex::try_from`], [`crate::XRegex::from_str`] or 
[`crate::XRegex::new`]. The former two use the default regex literal delimiter 
("/" transcoded in [`crate::delimited::DELIMITER`]); the latter allows a 
customised delimiter. An easy alternative is to use the macro 
`xregex!` ([`crate::xregex`]) when constructing an XRegex from literals.
55
56### Examples
57
58```rust
59use regex_literal::{XRegex,FromStr,Regex,Match,PatternID,Input,Anchored,xregex};
60
//example 0: create an XRegex struct from a one-pattern literal by xregex!()
62let text = "abc123";
63//construct XRegex
64let xre = xregex!(r"/^[a-z]+\d{3}$/"); 
// equivalent to the following variants - (1) XRegex::try_from(br"/^[a-z]+\d{3}$/").unwrap() (2) XRegex::from_str(r"/^[a-z]+\d{3}$/").unwrap() (3) XRegex::new(r"/^[a-z]+\d{3}$/",b"/").unwrap()
66//get regex reference from XRegex struct
67let re = xre.as_regex().unwrap();
68//check if the one pattern regex matches with the target text
69assert!(re.is_match(text));
70
71//example 1: create a XRegex struct from a one-pattern literal
72let text0 = "abc123";
73//create one-pattern literal
74let re0 = r#"/^[a-z]+\d{3}$/"#;
75//construct XRegex
76let x = re0.parse::<XRegex>().unwrap();//let x = XRegex::from_str(re0).unwrap();
77//get Regex from XRegex struct
78let x_one_pattern = x.as_regex().unwrap();
79//check if the one pattern regex matches with the target text
80assert!(x_one_pattern.is_match(text0));
81//find the first match if it exists
82let m = x_one_pattern.find(text0);
83assert_eq!(m,Some(Match::must(0,0..6)));
84
85//example 2: create a XRegex struct from a one-pattern literal
86let text1 = "ABBBC abc123";
87let re1 = "!!!!(?i)ab+c!!!!";
88//construct XRegex
89let y = XRegex::new(re1,b"!!!!").unwrap();
90//get Regex from XRegex struct
91let y_one_pattern = y.as_regex().unwrap();
92// check if this one pattern regex matches with the input
93assert!(y_one_pattern.is_match(text1));
94//find all non-overlapping leftmost matches
95let matches:Vec<Match> = y_one_pattern.find_iter(text1).collect();
96assert_eq!(matches,vec![Match::must(0,0..5),Match::must(0,6..9),]);
97
98//example 3: create a XRegex struct from a multiple-pattern literal
99let reu = r"[/(?i)ab+c/,/\w+/]";
100let mut m1 = XRegex::from_str(reu).unwrap();
101//get Regex from XRegex struct
102let m_patterns = m1.get_regex().unwrap();
103assert!(m_patterns.is_match(text1));
104let m_matches:Vec<Match> = m_patterns.find_iter(text1).collect();
105assert_eq!(m_matches,vec![Match::must(0,0..5),Match::must(0,6..9),Match::must(1,9..12)]); //non-overlapping leftmost matches
106
107let expected = Some(Match::must(1,0..7));
108let input = Input::new("23ABBBC abc&").anchored(Anchored::Pattern(PatternID::must(1)));//choose the specific pattern for input
109let n_patterns = XRegex::from_str(reu).unwrap().get_regex().unwrap();
110let mut caps = n_patterns.create_captures();
111n_patterns.search_captures(&input,&mut caps);
112assert_eq!(expected, caps.get_match());
113
114//example 4: create a XRegex struct from a regex sequence literal
115let rel = br#"</(?i)ab+c/,/^\w+?\d+$/>"#;
116let xre2= XRegex::try_from(&rel[..]).unwrap();
117let seq_slice = xre2.as_slice().unwrap();
118let child_regex = &seq_slice[1];
119assert!(child_regex.is_match("abc333"));
120
121```
122
123## Conversion of regex literals
1241. [`crate::util::delimit`] and [`crate::util::undelimit`] provide regex 
125literal conversion between undelimited and delimited forms.
126
127### Examples
128
129``` rust
130# use regex_literal::util::{delimit,undelimit};
131 
132let delimiter = "/";
133// a regex literal that includes delimiter(forward slash `/`)
134let re1 = r"\d{2}/\d{2}/\d{4}";
135let delimited1 = delimit(re1,delimiter);
136let string1 = r"/\d{2}\/\d{2}\/\d{4}/";
137assert_eq!(&delimited1[..],string1);
138
139let undelimited = undelimit(&delimited1[..],delimiter).unwrap();
140assert_eq!(&undelimited[..], re1);
141
142```
143
1442. [`crate::assembly::into_reu`] and [`crate::assembly::into_res`] annotate 
145patterns with default delimiters into delimited literals of regular expression 
146union and sequence accordingly. Note the transformations require feature "w".
147
148### Examples
149
150``` rust
151# use regex_literal::assembly::into_reu;
152let re1 = "(?i)ab+c";
153let re2 = r"\w+";
154let re_set = [re1,re2];
155let reu = into_reu(&re_set);
156assert_eq!(reu,r"[/(?i)ab+c/,/\w+/]".to_owned());
157
158```
159
160
161## Acknowledgements
`regex-literal` has adopted PCRE-style delimiters on top of the regex engines in the Rust crate regex-automata.
163
164[^1]: [literal expressions](https://doc.rust-lang.org/reference/expressions/literal-expr.html)
165
166[^2]: [PCRE flavor](https://pcre.org/original/doc/html/pcretest.html)
167
168[^3]: [Unicode characters with property White_Space=yes](https://en.wikipedia.org/wiki/Whitespace_character#Unicode)
169
170---
171
172
173
174*/
175
// Append the changelog markdown file to the crate-level documentation so that
// everything is documented in one place.

#![doc = include_str!("../CHANGELOG.md")]

// NOTE(review): this crate-wide `allow(unused)` silences every unused-code lint
// without a stated reason; confirm it is still needed or narrow its scope.
#![allow(unused)]
extern crate alloc;
// Private module; presumably holds the crate's error types -- TODO confirm.
mod error;
// Public text helpers such as `delimit` / `undelimit` (see the crate docs above).
pub mod util;

// Delimited-literal API; its public items are re-exported at the crate root below.
pub mod delimited;
pub use alloc::str::FromStr; // re-exported trait, implemented by XRegex
pub use delimited::*;
188
/// Builds an [`XRegex`](crate::XRegex) from a regex literal, panicking if the
/// literal cannot be turned into an `XRegex`.
///
/// Two forms are accepted (a trailing comma is allowed in both):
///
/// * `xregex!(literal)` - `literal` is a regex string literal that is parsed
///   through `str::parse::<XRegex>()`, i.e. with the default delimiter.
/// * `xregex!(literal, delimiter_bytes)` - both arguments are forwarded to
///   `XRegex::new`; `delimiter_bytes` is the byte string literal of the
///   delimiter, e.g. `b"!!!!"`.
///
/// # Panics
///
/// Panics when the underlying constructor returns an error. Call
/// `XRegex::from_str` or `XRegex::new` directly to handle the error instead.
#[macro_export]
macro_rules! xregex {
	($l:literal $(,)?) => {
		// Equivalent to XRegex::from_str($l); `parse` needs no trait import at the call site.
		$l.parse::<$crate::XRegex>()
			.expect("xregex!: invalid regex literal")
	};
	($l:literal, $d_bytes:literal $(,)?) => {
		$crate::XRegex::new($l, $d_bytes)
			.expect("xregex!: invalid regex literal or delimiter")
	};
}
199
// Cargo features reference: https://doc.rust-lang.org/cargo/reference/features.html
/// Feature `w` ("wiring"): assembly helpers such as `into_reu` and `into_res`.
/// Compiled when feature `w` is enabled, and also in every test build.
#[cfg(any(test, feature = "w"))]
pub mod assembly; 
/// Feature `c` ("correlation"): provides `par_matching_indices`.
/// Compiled when feature `c` is enabled, and also in every test build.
#[cfg(any(test,feature = "c"))]
pub mod correlation;
207
// Unit tests for the crate-root API. The `#[cfg(test)]` attribute makes Rust
// compile this module only for `cargo test`, never for `cargo build`.
#[cfg(test)]
mod tests {
	use super::*;

	// A single delimited pattern: checks the stored literal pair and a
	// case-insensitive match.
	#[test]
	fn regex_from_single_pattern_literal() {
		let literal = "/(?i)ab+c/";
		let mut xre = XRegex::from_str(literal).unwrap();
		// First element is compared with the delimiter byte, second with the full text.
		assert_eq!(xre.literal.0 as u8, b'/');
		assert_eq!(xre.literal.1, String::from("/(?i)ab+c/"));
		let single = xre.get_regex().unwrap();
		assert!(single.is_match("ABBBC"));
	}

	// A union literal: either member pattern may match.
	#[test]
	fn regex_from_pattern_union_str() {
		let union_literal = "[/(?i)ab+c/,/(?u)\\s{2}D+/]";
		let mut xre = XRegex::from_str(union_literal).unwrap();
		let union = xre.get_regex().unwrap();
		assert!(union.is_match("ABBBC"));
		assert!(union.is_match("  DD"));
	}

	// A sequence literal: both members match, and each member regex can be
	// used on its own through `as_slice`.
	#[test]
	fn resequence_from_literal() {
		let literal = r"</\s+/,/(\p{script=Han}+)(\d{3})/>";
		let mut xre = XRegex::from_str(literal).unwrap();
		let haystack = "  天下一家1234";
		assert_eq!(xre.data.matching_indices(haystack).unwrap(), vec![0, 1]);

		let members = xre.as_slice().unwrap();
		assert_eq!(members.len(), 2);

		// First member: the two leading whitespace characters.
		let leading_ws = members[0].find(haystack);
		assert_eq!(leading_ws, Some(Match::must(0, 0..2)));

		// Second member: capture group 1 spans the Han-script run.
		let han_digits = &members[1];
		let mut caps = han_digits.create_captures();
		han_digits.captures(haystack, &mut caps);
		let han = caps.get_group(1).unwrap();
		assert_eq!("天下一家", &haystack[han.start..han.end]);
	}

	// The two-argument form of `xregex!` takes the delimiter as a byte string.
	#[test]
	fn test_macro_xregex() {
		let mut built = xregex!("/(?i)ab+c/", b"/");
		assert!(built.get_regex().unwrap().is_match("ABBBC"));
	}
}
276
277
// Tests for the `assembly` module (feature `w`).
// NOTE(review): the earlier note asked for `cargo build --features "w" && cargo test`,
// but `assembly` is declared with `cfg(any(test, feature = "w"))`, so it is already
// compiled for test builds -- confirm whether the feature flag is still required.
// References:
// https://doc.rust-lang.org/cargo/reference/features.html#command-line-feature-options
// https://doc.rust-lang.org/cargo/commands/cargo-build.html
#[cfg(test)]
mod tests_on_w_feature {
	use super::*;
	use assembly::{assemble_reu,assemble_res,into_reu,into_res};

	// Assembling delimited patterns yields the same union literal, and the
	// same XRegex, as writing the union by hand.
	#[test]
	fn test_assemble_reu() {
		let expected_literal = r"[/(?i)ab+c/,/\w+/]";
		let from_literal = XRegex::from_str(expected_literal).unwrap();

		let parts = ["/(?i)ab+c/", r"/\w+/"];
		let assembled = assemble_reu(&parts).unwrap();
		assert_eq!(expected_literal.to_owned(), assembled);
		assert_eq!(from_literal, XRegex::from_str(&assembled).unwrap());
	}

	// Same check for the sequence form.
	#[test]
	fn test_assemble_res() {
		let expected_literal = r"</(?i)ab+c/,/\w+/>";
		let from_literal = XRegex::from_str(expected_literal).unwrap();

		let parts = ["/(?i)ab+c/", r"/\w+/"];
		let assembled = assemble_res(&parts).unwrap();
		assert_eq!(expected_literal.to_owned(), assembled);
		assert_eq!(from_literal, XRegex::from_str(&assembled).unwrap());
	}

	// Undelimited patterns are wrapped in default delimiters and joined into a union literal.
	#[test]
	fn test_into_reu() {
		let patterns = ["(?i)ab+c", r"\w+"];
		let union_literal = into_reu(&patterns);
		assert_eq!(union_literal, r"[/(?i)ab+c/,/\w+/]".to_owned());
	}

	// Undelimited patterns are wrapped in default delimiters and joined into a sequence literal.
	#[test]
	fn test_into_res() {
		let patterns = ["(?i)ab+c", r"\w+"];
		let sequence_literal = into_res(&patterns);
		assert_eq!(sequence_literal, r"</(?i)ab+c/,/\w+/>".to_owned());
	}
}
332
// Tests for the `correlation` module (feature `c`).
#[cfg(test)]
mod tests_on_c_feature {
	use super::*;
	use correlation::par_matching_indices;

	// Both members of the sequence match the haystack, so indices 0 and 1 are expected.
	#[test]
	fn test_parallel_matching() {
		let literal = r"</\s+/,/(\p{script=Han}+)(\d{3})/>";
		let haystack = "  天下一家1234";
		let mut xre = XRegex::from_str(literal).unwrap();
		let members = xre.as_slice().unwrap();
		let matched = par_matching_indices(members, haystack).unwrap();
		assert_eq!(matched, vec![0, 1]);
	}
}
348