regex_literal/
delimited.rs

/*!
# The core module of delimited literals
[`crate::delimited`] defines the struct [`XRegex`], which keeps the compiled
[`Regex`] items of a literal in a [`ReSequence`] together with a
[`LiteralForm`] tag telling which kind of literal they came from. Its
associated functions `try_from` and `from_str` call the `parse` and `compile`
functions one after the other, converting a delimited regular expression
literal into a compiled struct via intermediate data of type `BTreeMap<u32,Meta>`.
*/
9
10#![allow(unused)]
11
12pub use regex_automata::{meta::{Regex,BuildError},Span,Match,PatternID,Input,Anchored};
13pub use crate::error::Error; 
14use crate::util;
15
16use core::cell::Cell;// the wrapper structure as a mutable pointer to bytes vector in thread local static 
17use alloc::collections::btree_map::BTreeMap;// the data structure for meta_map
18
19use core::convert::TryFrom; //TryFrom trait used by 1 u32 ; 2.XRegex
20use alloc::str;
21use alloc::str::FromStr; //FromStr trait used by XRegex
22
23use core::cmp::PartialEq;
24
25use core::result::Result;
26use alloc::string::String;
27
28use core::slice::{Iter,IterMut};
29use alloc::vec::{Vec,IntoIter};
30
31//use thread_local macro reference: https://doc.rust-lang.org/std/macro.thread_local.html
thread_local! {
	/// Per-thread regex literal delimiter: 1-4 single-byte punctuation
	/// characters packed into one `u32` value. It starts out as a single `/`.
	pub static DELIMITER:Cell<u32> = const { Cell::new(b'/' as u32) };
}

/// Bracket pairs (opening byte, closing byte) that enclose regex sets:
/// entry 0 (`[` `]`) encloses a regex union, entry 1 (`<` `>`) a regex sequence.
pub static RE_SET_DELIMITERS:[[u8;2];2] = [*b"[]", *b"<>"];

/// The punctuation byte that separates the items of a regex set (union or sequence).
pub static RE_SEPARATOR:u8 = b',';

/// DELIMITER_CHARS is the collection of punctuation bytes from which a
/// regular expression delimiter may be composed: !#$%&*+,./:;=?@^_|~-
pub static DELIMITER_CHARS: [u8;20] = *b"!#$%&*+,./:;=?@^_|~-";
47
48// A broader range of punctuation selection in forming RE delimiters: any non-alphanumeric, 
49// non-backslash, non-whitespace character.
50// [PCRE Delimiter Reference](https://pcre.org/original/doc/html/pcretest.html)
51// Regex::new(r"^[^<>[\]\w\\\\s]+$").unwrap();
52// Note: in a character class (square brackets) any character except ^, -, ] or \ is a literal.
53
54
55/** 
56function set_delimiter customises delimiters for regex literal.
57The passed delimiter value (as a byte array) consists of 1 or upto 4 single-byte 
58characters. It is firstly validated against [`DELIMITER_CHARS`]; and then 
59is converted into u32 and stored in thread_local static [`DELIMITER`] and the 
60function returns true if it is valid; otherwise [`DELIMITER`] is not updated 
61and the function reuturns false. 
62
63A regular expression literal (reliteral) is enclosed by delimiters. 
64A pair of forwardslashes -- "/pattern_text/" originally from the 
65matching operator in Perl[^1], is used as the default delimiters in reliteral, 
66which makes itself distinct from the other [Rust literal expressions](https://doc.rust-lang.org/reference/expressions/literal-expr.html)
67
68Any delimiter sequence in the pattern text of reliteral is prepended 
69with a backslash.  To avoid using too many escaped backslashe, reliteral 
70delimiter can be customised. 
71
72*/
73
74pub fn set_delimiter(delimiter:&[u8]) -> bool {	
75	let result = validate_delimiter(delimiter);
76	if result {
77				//update DELIMITER with a u32 value calculated from delimiter's byte.
78				DELIMITER.with(
79					|cell_delimiter| {
80							let code = bytes_to_u32(delimiter).unwrap(); 
81							cell_delimiter.set(code); 
82					}
83				); 
84		}
85	result
86}
87
88pub fn get_delimiter() -> Vec<u8> {
89	let mut bytes = vec![];
90	DELIMITER.with(
91						|cell_delimiter| {
92							let  delimiter_u32 = cell_delimiter.get();
93							bytes = u32_to_bytes(delimiter_u32);
94						}			
95					);
96	bytes
97}
98/**
99function validate_delimiter checks delimiter against [`DELIMITER_CHARS`], 
100A byte sequence that consists of either a single-byte candidate puncutation or 
101multiple (up to 4) repetitive ones is valid for enclosing pattern text. 
102Note bracket style delimiters in many PCRE (Perl Compatible Regular Expressions
103[^2]) engines are excluded from DELIMITER_CHARS, as they are reserved for 
104delimiting elements in regex sets.
105
106Note: 
107* There are two styles of delimiters in PCRE: matched delimiters and 
108bracket-style delimiters. This Rust crate regex-literal only uses matched delimiters: single 
109or mutiple repeated punctuation characters (excluding quote characters: " ' `) in 
110enclosing regex literals. Bracket-style punctuations ([],<>,(),{}, and etc) are reserved 
111for regex set literals. 
112[^1]: <https://perldoc.perl.org/perlre>
113[^2]: <https://pcre.org/original/doc/html/pcretest.html>
114
115*/
116
117pub fn validate_delimiter(delimiter:&[u8]) -> bool {
118	let delim_length = delimiter.len();
119	if  delim_length > 4 {return false;} //limit delimiter size up to 4 bytes
120	let first = delimiter[0];
121	if DELIMITER_CHARS.iter().any(|&x| x == first) {
122				if delim_length > 1 { // if multiple-byte delimiter
123				let rest = &delimiter[1..];
124				rest.iter().all(|&x| x == first) //only repetitive bytes(punctuation characters) are allowed
125				} else {true} //else for one-byte delimiter
126	} else {false}	
127}
128
/// Packs a byte slice of at most 4 bytes into a `u32`, first byte most
/// significant. Returns `None` when the slice is longer than 4 bytes; an empty
/// slice packs to `Some(0)`.
fn bytes_to_u32(bytes:&[u8]) -> Option<u32> {
	if bytes.len() > 4 {
		return None;
	}
	let packed = bytes
		.iter()
		.fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte));
	Some(packed)
}
143
/// Unpacks a `u32` into its big-endian bytes with the leading zero bytes
/// trimmed off. The value 0 yields a single zero byte.
fn u32_to_bytes(c:u32) -> Vec<u8> {
	let all_bytes = c.to_be_bytes();
	// Index of the first non-zero byte; for 0 keep just the last byte.
	let first_kept = all_bytes
		.iter()
		.position(|&byte| byte != 0)
		.unwrap_or(all_bytes.len() - 1);
	all_bytes[first_kept..].to_vec()
}
162
163/// ReSequence is the sequence of regex_automata::Regex (can be either 
164/// single-pattern or multiple-pattern)  that can be utilized in a timeline
165/// /series of matching events. Its method matching_indices provide vector data
166/// for including Naive Bayers Classifiers in future.
167#[derive(Debug,Clone)] //To debug a struct in Rust, you can use the Debug trait. The Debug trait provides a way to format the output of a struct in a programmer-facing, debugging context
168pub struct ReSequence(Vec<Regex>);
169
170impl ReSequence {
171	/// Construct a new, empty `ReSequence`
172	/// The Regex vector as resequence's field 0 will not be allocated in initialization   
173    #[inline]
174	pub const fn new() -> Self { // need transfrom like this https://docs.rs/regex/latest/src/regex/regexset/string.rs.html? No
175		 ReSequence(Vec::new()) 
176	}
177	/// Appends an element to the back of a collection.
178	#[inline]
179	pub fn push(&mut self, elem:Regex)	{
180		self.0.push(elem);
181	}
182	/// return the number of elements in ReSequence struct.
183	#[inline]
184	pub fn len(&self) -> usize {
185        self.0.len()
186	}
187
188	/// tell if the ReSequence struct is empty or not
189	#[inline]
190		pub fn is_empty(&self) -> bool {
191	        self.0.is_empty()
192		}
193	// https://users.rust-lang.org/t/newtype-pattern-for-vec-how-to-implement-iter/52653/2
194	/// Returns an iterator over the slice.
195	///
196	/// The iterator yields all items from start to end
197	#[inline]
198	pub fn iter(&self) -> Iter<'_, Regex> {
199	        self.0.iter()
200	 }
201	 
202    /// Returns an iterator that allows modifying each value.
203    /// The iterator yields all items from start to end.
204    #[inline]
205	 pub fn iter_mut(&mut self) -> alloc::slice::IterMut<'_, Regex> {
206	         self.0.iter_mut()
207	 }
208
209	 
210	 /// Removes the last element from a vector and returns it, or None if it is empty.
211	 #[inline]
212	 pub fn pop(&mut self) -> Option<Regex> {	 	
213	 	self.0.pop()
214	 }
215
216	/// Removes the last element from a vector and returns it, or None if it is empty.
217		 #[inline]
218		 pub fn remove(&mut self,index: usize) -> Regex {	 	
219		 	self.0.remove(index)
220		 }
221
222	/// Resequence is used as slice.
223	#[inline]
224	 pub fn as_slice(&self) -> &[Regex] {
225	 	self.0.as_slice()
226	 }
227
228//@TODO document `size limit` in documentation;
229/// get matching regex indices in Resequence upon one text target
230	pub fn matching_indices(&self,target:&str) -> Result<Vec<u16>,String> {
231		
232		let data_length = self.0.len();
233		let max_index = u16::MAX as usize;
234		if data_length == 0 {Err("Unable to do matching by an empty ReSequence struct".to_string())}
235		else if max_index < data_length - 1{
236			Err("Unable to do matching as this XRegex data contains more than 2^16 regex structs".to_string())
237		}else {
238			let index_iter = self.0.iter().enumerate()
239						.filter(|&(i,regex_ref)| regex_ref.is_match(target))
240						.map(|(i,_)| i as u16);
241			let indices: Vec<u16> = index_iter.collect();
242			Ok(indices)
243		}	  
244	}	 
245}
246
247
248//Implementing Iterator https://doc.rust-lang.org/std/iter/index.html
249//ref example https://stackoverflow.com/questions/34733811/what-is-the-difference-between-iter-and-into-iter
250//ref example https://stackoverflow.com/questions/30218886/how-to-implement-iterator-and-intoiterator-for-a-simple-struct
251
252/// The method that converts ReSequence into an Iterator, which works with `for` syntax.
253//#[cfg(not(no_global_oom_handling))]
254impl IntoIterator for ReSequence {
255    type Item = Regex;
256    type IntoIter = IntoIter<Self::Item>;
257    
258	#[inline]
259    fn into_iter(self) -> Self::IntoIter {
260        self.0.into_iter()
261    }   
262}
263
264/// Create a value from an iterator.
265//reference https://doc.rust-lang.org/std/iter/trait.FromIterator.html
266//#[cfg(not(no_global_oom_handling))]
267impl FromIterator<Regex> for ReSequence {
268	#[inline]
269	fn from_iter<I: IntoIterator<Item = Regex>>(iter: I) -> Self {
270			let mut c = ReSequence::new();
271	        for i in iter {
272	        	c.push(i);
273	        }
274	        c
275	    }	    	
276}
277
278
279
/// Identifies which kind of regex literal a piece of data was built from.
#[derive(Debug, Clone)]
pub enum LiteralForm {
	/// Literal representation of [`crate::ReSequence`].
	ReS,
	/// Literal representation of a [`Regex`] that holds multiple patterns.
	ReU,
	/// Literal representation of a [`Regex`] that holds one pattern.
	Re,
}
290
/// A collection of regular expression data artifacts: the compiled regex
/// items together with the literal text and delimiter they were built from.
#[derive(Debug, Clone)]
pub struct XRegex {
	/// Item 0: the delimiter (re_puncts) packed into a `u32`;
	/// item 1: the literal string (whitespace trimmed during parsing).
	pub literal:(u32,String),
	/// The compiled regex items.
	pub data:ReSequence,
	/// The literal form (`Re`, `ReU` or `ReS`) this value was built from.
	pub kind:LiteralForm,
}
298
299//Compare two XRegex structs, reference https://doc.rust-lang.org/std/cmp/trait.PartialEq.html
300
301impl PartialEq for XRegex {
302	fn eq(&self, other: &Self) -> bool {
303		self.literal.0 == other.literal.0 && self.literal.1 == other.literal.1 
304	}
305}
306
307
308//TODO: XRegex use some design pattern in future? https://rust-unofficial.github.io/patterns/patterns/behavioural/strategy.html
309
310/// convert reliteral bytes to XRegex
311impl TryFrom<&[u8]> for XRegex {
312		type Error = Error;//crate::error::Error
313	fn try_from(value: &[u8]) -> Result<Self,Self::Error> {
314		DELIMITER.with(
315					|cell_delimiter| {	
316					let  delimiter_u32 = cell_delimiter.get();
317					let re_puncts:Vec<u8> = u32_to_bytes(delimiter_u32);
318					let metadata = parse(value,&re_puncts[..])?;
319					compile(value,metadata,&re_puncts[..])
320					}
321		)			
322	}
323}
324
325//TODO: other construction option:read file as binary into utf-8 string: use std::fs; String::from_utf8_unchecked(&fs::read("address.txt")?)
326//https://doc.rust-lang.org/std/str/trait.FromStr.html
327/// convert reliteral string (&str) into XRegex
328impl FromStr for XRegex {
329	type Err = Error;
330	fn from_str(value:&str) -> Result<Self,Self::Err> {
331		Self::try_from(value.as_bytes())
332	}	
333}
334
335impl XRegex {
336	/// XRegex constructor creates XRegex struct with a customised re_puncts
337	pub fn new(re_text:&str,re_puncts:&[u8]) -> Result<Self,Error> {
338		if !validate_delimiter(re_puncts) {
339			let mut msg = "Invalid delimiter:".to_owned();
340			let msg_body = match str::from_utf8(re_puncts) {
341				Err(_) => "non-utf8-code",
342				Ok(puncts_str) => puncts_str,
343			};
344			msg.push_str(msg_body);
345			Err(Error::Syntax(msg))
346		}else {
347			let reliteral = re_text.as_bytes();
348			let metadata = parse(reliteral,re_puncts)?;
349			compile(reliteral,metadata,re_puncts)
350		}	
351	}
352	
353	/// check if XRegex is Resequence or not
354	pub fn is_seq(&self) -> bool {
355		matches!(self.kind,LiteralForm::ReS)
356		}
357
358	/// remove the first regex item from XRegex struct and returns it, or None if it is empty
359	pub fn get_regex(&mut self) -> Option<Regex> {
360		if self.data.is_empty() {None}
361		else {Some(self.data.remove(0))}
362	}
363	/// get regex reference to this XRegex struct. None is returned if it is not item kind.
364	pub fn as_regex(&self) -> Option<&Regex> { //https://stackoverflow.com/questions/29662807/how-do-you-borrow-a-mutable-reference-during-a-match
365		if self.is_seq() || self.data.is_empty() {None}
366		else {
367				let slice = self.data.as_slice();
368				Some(&slice[0])
369		}
370	}
371
372	/// get regex sequence from XRegex struct. None is returned if it is not seq kind.
373	pub fn as_slice(&self) -> Option<&[Regex]> {
374	 	if self.is_seq() {
375	 		Some(self.data.as_slice())
376	 	}else {None}	
377	}
378
379	/// tell if its data is empty
380	pub fn is_empty(&self)	-> bool {
381		self.data.is_empty()
382	}
383
384	
385}
386
387//may develop a WASM version of `Meta`. memory allocation for wasm stack memory: 4G
388//WebAssembly linear memory objects have sizes measured in pages. Each page is 65536 (2^16) bytes. In WebAssembly version 1, a linear memory can have at most 65536 pages, for a total of 2^32 bytes (4 gibibytes).
389//https://stackoverflow.com/questions/40417774/memory-limits-in-webassembly#:~:text=1%20current%20WebAssembly%20implementations%20follow%20a%2032bit%20addressing,pages%20as%20something%20more%20%22safe%22%20than%20desktop%20applications.
390
391/// intermediate data in between parse and compile stages when converting reliteral to XRegex
392struct Meta {
393	/// kind is in accordance with the variants in XRegex 
394	kind:LiteralForm,
395	//the content range with opening and closing delimiters excluded
396	range:(u32,u32),
397	// an optional list containing the children indices
398	children:Option<Vec<u32>>,
399}
400
401
402impl Meta {
403    /// add child meda index to the meta children list
404    fn add_child(&mut self,child_index:u32) {
405		match &mut self.children { //match &mut self.children ??
406			Some(ref mut vec) => {vec.push(child_index);}, // &mut, ref mut are omitted as the compiler can infer them
407			//&mut Some(ref mut vec) => {vec.push(child_index)},
408			//https://stackoverflow.com/questions/29662807/how-do-you-borrow-a-mutable-reference-during-a-match
409			None => {self.children = Some(<Vec<u32>>::from([child_index]));},
410			//&mut None => {self.children = Some(<Vec<u32>>::from([child_index]));},
411		}
412	}
413	/// appoint the meta's closing range 
414	fn finalise(&mut self, right_range:u32){
415		self.range.1 = right_range;
416	}
417}
418/// construct meta_re specify the kind as LiteralForm::Re with the range excluding delimiters 
419fn create_meta_re(start:usize,end:usize) -> Meta {
420	Meta{kind:LiteralForm::Re,range:(start as u32,end as u32),children:None}							
421}
422
423/// analyse reliteral (the byte form of regex literal) with the preset 
424/// re_puncts (the byte form of regex delimiter), a tuple of root meta 
425/// index in reliteral and meta_map is returned when the execution is successful.
426/// The max reliteral length is set to 32 bits, which makes the produced keys 
427/// in meta_map (BTreeMap) is confined to u32.
428fn parse(reliteral: &[u8],re_puncts: &[u8]) -> Result <(u32,BTreeMap<u32,Meta>),Error>{
429	//metaMap stores all the meta data of literal forms and their indices
430	let mut meta_map:BTreeMap<u32,Meta> = BTreeMap::new(); 
431	let target_size = reliteral.len();
432	let reliteral_bytes_fitting_in_u32 = u32::try_from(target_size);
433	if let Err(err) = reliteral_bytes_fitting_in_u32 {
434		return Err(Error::Syntax(format!("Invalid reliteral as its size exceeds the limit of 2^32 bytes: {err}")));
435	}
436
437
438	let re_delimiter_length = re_puncts.len();
439	//walk through all bytes of reliteral
440	let mut index:usize = 0;
441	let start = util::offset_ws(reliteral,index);
442	
443	index = start;
444	let walk_over = proceed(&mut index,reliteral,re_puncts,&mut meta_map);
445	if walk_over {	
446		let end = util::offset_ws(reliteral,index);							
447		if end != target_size {
448			return Err(Error::Syntax(format!("Invalid reliteral - an unparsed tail from byte index {end}.")));
449		}
450		if meta_map.is_empty() {
451			return Err(Error::Syntax("Invalid reliteral - no meta data has been parsed.".to_owned()))
452		}								
453		Ok((start as u32,meta_map))
454	} else {
455	Err(Error::Syntax("Unrecognized reliteral format!".to_owned()))
456	}							
457}
458
459	/// proceed reliteral while iterating its byte index i with the provided re_puncts for producing meta_map
460	fn proceed(i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
461		store_re(None,i,reliteral,re_puncts,meta_map) || 
462		store_reu(None,i,reliteral,re_puncts,meta_map) || 
463		store_res(None,i,reliteral,re_puncts,meta_map) 
464	}
465		
466		// pc_index parent closure index 
467		fn store_re(pc_index:Option<u32>,i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
468			let j = *i;
469			let this_index = j as u32;
470			let re_delimiter_length = re_puncts.len();
471			if let Some(indices) = find_re_range(j,reliteral,re_puncts) {
472				
473				meta_map.insert(this_index,create_meta_re(indices[0],indices[1]));//create and insert re item
474				if	let Some(pos) = pc_index { // add the index to its parent meta when it exists
475					if let Some(meta) = meta_map.get_mut(&pos) {
476						meta.add_child(this_index); 
477					} else {
478						return false;//throw error when the parent item can't be located by index in meta_map
479					}	
480				}
481				*i = indices[1] + re_delimiter_length;
482				true
483			}else {false}								
484		}
485		
486		fn store_reu(pc_index:Option<u32>,i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
487			let start = *i;
488			let this_index = start as u32;
489			if (reliteral[start] == RE_SET_DELIMITERS[0][0]){ //matching ReUnion delimiter
490				*i += 1; 
491				let mut this_meta = Meta{kind:LiteralForm::ReU,range:(*i as u32,*i as u32),children:None};
492				meta_map.insert(this_index,this_meta);
493				*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters
494				
495				if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map) {
496					return false;
497				}
498				//make sure getting first item
499				let mut proceeding = true;
500				while proceeding { //iteratively collecting RE SEPARATOR and item
501					*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters after the re item
502					if reliteral[*i] == RE_SEPARATOR { //matching the SEPARATOR character
503						*i += 1; //stepping RE_SEPARATOR
504						*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters after RE_SPERATOR
505							
506						if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map) {
507							return false;
508						}//the function returns a false abnormality when no item follows RE_SEPARATOR
509					}else {proceeding = false;}
510				}
511				if	reliteral[*i] == RE_SET_DELIMITERS[0][1]  { //matching the closing delimiter of ReU
512					//assign i to the right range
513					
514					if let Some(this_meta) = meta_map.get_mut(&this_index){
515						this_meta.finalise(*i as u32);
516					}
517					
518					 
519					if let Some(pos) = pc_index {
520						if let Some(meta) = meta_map.get_mut(&pos) {
521							meta.add_child(this_index); 
522						} else {
523							return false;//unable to locate the parent item by index in meta_map
524						}	
525					}														
526					*i += 1; //stepping over the closing RE_SET_DELIMITER
527					return true;
528				}
529			} //end of if matching ReUnion delimiter
530			false	
531		}
532		
533		fn store_res(pc_index:Option<u32>,i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
534			let start = *i;
535			let this_index = start as u32;
536			if (reliteral[start] == RE_SET_DELIMITERS[1][0]){ //matching ReSequence delimiter
537				*i += 1; 
538				let this_meta = Meta{kind:LiteralForm::ReS,range:(*i as u32,*i as u32),children:None};
539				meta_map.insert(this_index,this_meta);
540				*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters
541				if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map) 
542               && !store_reu(Some(this_index),i,reliteral,re_puncts,meta_map) {
543					return false;
544				}
545				//make sure getting first item
546				let mut proceeding = true;
547				while proceeding { //iteratively collecting RE SEPARATOR and item
548					*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters after the re item
549					if reliteral[*i] == RE_SEPARATOR { //matching the SEPARATOR character
550						*i += 1; //stepping RE_SEPARATOR
551						*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters after RE_SPERATOR
552						if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map) && 
553						!store_reu(Some(this_index),i,reliteral,re_puncts,meta_map) {
554							return false;
555						}
556					}else {proceeding = false;}
557				}
558				if	reliteral[*i] == RE_SET_DELIMITERS[1][1]  { //matching the closing delimiter of ReS
559					//assign i to the right range
560					if let Some(this_meta) = meta_map.get_mut(&this_index){
561						this_meta.finalise(*i as u32);
562					}
563					
564					 
565					if let Some(pos) = pc_index {
566						if let Some(meta) = meta_map.get_mut(&pos) {
567							meta.add_child(this_index); 
568						} else {
569							return false;//unable to locate the parent item by index in meta_map
570						}	
571					}													
572					
573					*i += 1; //stepping over the closing RE_SET_DELIMITER										
574					return true;
575				}
576			} //end of if matching ReSequence delimiter
577		  false						
578		} //end of fn store_res								
579
580
581
582/// Construct regex from meta_ref, save it into pool, and return literal back
583fn compile_re(source:&[u8],pool:&mut ReSequence,meta_ref:&Meta,re_puncts:&[u8]) -> Result<String,Error> {
584		let start = meta_ref.range.0 as usize;
585		let stop = meta_ref.range.1 as usize;
586		let regex = regex_from_delimited_literal(&source[start..stop],re_puncts)?;	
587		pool.push(regex);//store to pool data
588		let re_puncts_length = re_puncts.len();
589		let full_start = start-re_puncts_length;
590		let full_stop = stop+re_puncts_length;
591		let postback_bytes = &source[full_start..full_stop];
592		match str::from_utf8(postback_bytes) {
593			Ok(postback) => Ok(postback.to_owned()) ,
594			Err(error) => Err(Error::from_utf8_error(error,full_start)) ,
595		}		
596}
597
598
599/// construct regexset from meta_ref and meta_map, save it into pool, and return literal
600fn compile_reu(source:&[u8],pool:&mut ReSequence, meta_ref:&Meta,meta_map:&BTreeMap<u32,Meta>,re_puncts:&[u8]) 
601	-> Result<String,Error> {
602	let range:[u32;2] = [meta_ref.range.0,meta_ref.range.1]; 
603	if let Some(ref children_indices) = meta_ref.children { 
604	//an alternative way is to use children_indices.iter().map (|x| ...)
605		
606		let mut re_union:Vec<&[u8]> = Vec::new(); //store regex patterns (without delimiters) into regex union
607		//refactor re_bytes to re_literals
608		let mut re_literals:Vec<&str> = Vec::new(); //store regex literals (including regex delimiters) into regex literal 
609		let puncts_length = re_puncts.len();
610		//an array of u8 vector is needed to be constructed beforehand
611		//https://stackoverflow.com/questions/70510299/how-to-declare-a-static-array-of-vectors#:~:text=If%20you%20know%20the%20size%20of%20the%20%22vec%22,This%20lets%20you%20write%20something%20like%20the%20following%3A
612		for child_start_index in children_indices.iter() {
613			if let Some(child_meta_ref) = meta_map.get(child_start_index){
614
615				let start = child_meta_ref.range.0 as usize;
616				let stop = child_meta_ref.range.1 as usize;
617				let full_start = start - puncts_length;
618				let full_stop = stop + puncts_length;
619			
620				let re_item = &source[start..stop];
621				re_union.push(re_item);
622
623				match str::from_utf8(&source[full_start..full_stop]) { //get literal
624					Ok(re_literal) => re_literals.push(re_literal),
625					Err(err) => return Err(Error::from_utf8_error(err,full_start)),
626				}
627			} else { //the case that child meta is not found by the child_start_index
628				return Err(Error::Syntax(
629					format!("The literal of ReU (RegexUnion) ranging {range:?} does not have valid Regex item at byte index {child_start_index}.")
630				));
631			}
632		}
633		if re_union.is_empty() {
634			return Err(Error::Syntax(
635				format!("The literal for ReU (RegexUnion) ranging {range:?} contains 0 regex item.")
636			));
637		}
638
639		let regexset = regexset_from_delimited_literals(&re_union[..],re_puncts)?;
640		pool.push(regexset);
641		let mut postback_string = String::from("");
642		postback_string.push(char::from_u32(RE_SET_DELIMITERS[0][0] as u32).unwrap());
643		postback_string.push_str(&re_literals.join(str::from_utf8(&[RE_SEPARATOR]).unwrap()));
644		postback_string.push(char::from_u32(RE_SET_DELIMITERS[0][1] as u32).unwrap());
645		Ok(postback_string)	
646	} else { //when meta_ref.children is None
647		Err(Error::Syntax(
648		format!("The literal for ReU (Regex Union) ranging {range:?}  does not have valid Regex item.")
649		))
650	}
651} 
652
653/// construct a vector of regex data from meta_ref and meta_map, save it to pool and post back literal
654fn compile_res(source:&[u8],pool:&mut ReSequence, meta_ref:&Meta,meta_map:&BTreeMap<u32,Meta>,re_puncts:&[u8]) -> 
655Result<String,Error> {	
656		let range:[u32;2] = [meta_ref.range.0,meta_ref.range.1];
657		
658		if let Some(ref children_indices) = meta_ref.children {
659		 	let sequence_length = children_indices.len();
660		 	let mut literal_seq:Vec<String> = Vec::with_capacity(sequence_length);
661		 	for child_start_index in children_indices.iter() {
662				if let Some(child_meta_ref) = meta_map.get(child_start_index){
663						match child_meta_ref.kind {
664							LiteralForm::Re => {
665								let re = compile_re(source,pool,child_meta_ref,re_puncts)?;
666								literal_seq.push(re);
667							},
668							LiteralForm::ReU => {
669								let reu = compile_reu(source,pool,child_meta_ref,meta_map,re_puncts)?;
670								literal_seq.push(reu);
671							},
672							_ => {
673								return Err(Error::Syntax(
674											format!("The literal of ReS (ReSequence) ranging {range:?} has encountered an unhandled meta kind at index {child_start_index}.")
675											)
676										);
677							},
678						}
679				} else { //child meta is not found by child_start_index
680						return Err(Error::Syntax(
681							format!("Within ReS (ReSequence) ranging {range:?}, the Regex item cannot be located by its byte index {child_start_index}.")
682						));
683				}
684		 	}
685		 	if literal_seq.is_empty() {
686				return Err(Error::Syntax(
687					format!("The literal for ReS (ReSequence) ranging {range:?} contains 0 Regex item.")
688				));
689			}
690
691			let mut postback_string = String::from("");
692			postback_string.push(char::from_u32(RE_SET_DELIMITERS[1][0] as u32).unwrap());
693			let joined = &literal_seq[..].join(str::from_utf8(&[RE_SEPARATOR]).unwrap()); //to be tested
694			postback_string.push_str(&literal_seq.join(&joined[..]));
695			postback_string.push(char::from_u32(RE_SET_DELIMITERS[1][1] as u32).unwrap());
696			Ok(postback_string)	
697
698		}  else { 
699			Err(Error::Syntax(
700			format!("The literal for ReS (Regex Sequence) positioned ranging {range:?} has zero Regex item.")
701			))
702		}
703}
704
705 /// the method constructs XRegex data given reliteral source and the parsed metadata
706 fn compile(source:&[u8],parsed:(u32,BTreeMap<u32,Meta>),re_puncts:&[u8]) -> Result<XRegex, Error> {
707			let re_delimiter_length = re_puncts.len();
708			let index = parsed.0;
709			let meta_map = &(parsed.1); 
710			if let Some(meta_ref) = meta_map.get(&index) {
711			
712				let mut pool:ReSequence = ReSequence::new();
713				let mut pool_ref = &mut pool;
714				let delimiter = bytes_to_u32(re_puncts).ok_or(Error::Syntax("Failed in delimiter transcoding.".to_owned()))?;//double check	
715							
716				match meta_ref.kind {
717					LiteralForm::Re => {
718						let re = compile_re(source,pool_ref,meta_ref,re_puncts)?; 
719						Ok(XRegex{data:pool,literal:(delimiter,re),kind:LiteralForm::Re})
720					}, 
721					LiteralForm::ReU => {
722						let reu = compile_reu(source,pool_ref,meta_ref,meta_map,re_puncts)?;
723						Ok(XRegex{data:pool,literal:(delimiter,reu),kind:LiteralForm::ReU}) 
724					}, 
725					LiteralForm::ReS => {
726						let res = compile_res(source,pool_ref,meta_ref,meta_map,re_puncts)?;
727						Ok(XRegex{data:pool,literal:(delimiter,res),kind:LiteralForm::ReS})
728					},
729				} 				
730			} 
731			else {
732				Err(Error::Syntax(format!("No meta data indexed at {index} in meta_map.")))
733			}
734 }
735
736/// Construct regex from a delimited literal. @todo ,refactor it for storing original escaped characters
737fn regex_from_delimited_literal(rebody:&[u8],delimiter:&[u8]) -> 
738Result<Regex,Error> {
739	let unescaped = match util::unescape_from_bytes(rebody,delimiter){
740		Ok(text) => text,
741		Err(err_info) => return Err(Error::Syntax(err_info)),
742	};	
743	Regex::new(&unescaped[..]).map_err(Error::from_meta_build_error)
744}	
745
746/// Construct regex from an array of delimited literals.
747fn regexset_from_delimited_literals(rebodies:&[&[u8]],delimiter:&[u8]) -> 
748Result<Regex,Error> {
749	let mut vec = Vec::new();//for storing  reliterals (String type) representing Re
750	for bytes_ref in rebodies.iter() {
751		let unescaped = match util::unescape_from_bytes(bytes_ref,delimiter){
752			Ok(text) => text,
753			Err(err_info) => return Err(Error::Syntax(err_info)),
754		};
755		vec.push(unescaped.into_owned());
756	}
757	//get the references from iterator, following example:https://doc.rust-lang.org/std/vec/struct.Vec.html#method.iter
758	let mut ref_vec = Vec::new();
759	let vec_refs = &vec;
760	for bytes_ref in vec_refs.iter(){ //iter() iterates over &String (= &str)
761		ref_vec.push(bytes_ref);
762	}
763	Regex::new_many(&ref_vec).map_err(Error::from_meta_build_error)
764}
765
766/// Given the starting index i in reliteral bytes, the function is to find 
767/// the content range of in reliteral in between the pair of delimiters
768/// (represented as re_puncts in UTF-8 bytes).
769/// The boundary indices  are returned if found; otherwise, `None` is returned.
770fn find_re_range(i:usize,reliteral:&[u8],re_puncts:&[u8]) -> Option<[usize;2]> {
771	let re_delimiter_length = re_puncts.len();
772	let target_length = reliteral.len();
773	
774	let mut result:[usize;2] = [0;2];
775	let mut k = i;
776	let mut step:usize = 0;
777	let first_slice_end = k + re_delimiter_length;
778	//the following condition ensures (1) there is valid content length besides the pair of re_delimiters (2) the opening re_puncts has been caught
779	if target_length > (k + 2 * re_delimiter_length)    &&  &reliteral[k..first_slice_end] == re_puncts {
780		result[0] = k + re_delimiter_length;
781		step = re_delimiter_length;
782	}else {return None;}
783	let mut escaped = false;
784	while step > 0 {
785		k += step;
786		if (k + re_delimiter_length) > target_length {
787			return None;//the closing re_delimiter is not found till the end of bytes
788		}
789		let code = reliteral[k];
790		let char_length = util::infer_char_size(code);
791		match char_length {
792			0 => {
793				println!("invalid UTF code is found at index {}",k);
794				return None;
795			},
796			1 => {
797				if code == b'\\' {
798					escaped = !escaped;
799				}else {
800					if !escaped {
801						if &reliteral[k..(k + re_delimiter_length)]	== re_puncts { // the cadidate characters used by re_delimiter are 1-byte character only
802							result[1] = k;
803							return Some(result);
804						}
805					}else {escaped = false;}
806				}	
807			},
808			_ => {
809				if escaped {
810					escaped = false;
811				}	
812			},
813			
814		} // end of match
815		step = char_length as usize;	
816	} //end of while loop
817	None
818	
819}
820
#[cfg(test)]
mod tests {
	use super::*;

	/// Escaped delimiters inside the pattern must not end the literal.
	#[test]
	fn test_find_re_range(){
		let literal = "/(?i)\\/ab+c\\//";
		let bounds = find_re_range(0,literal.as_bytes(),b"/");
		assert_eq!(bounds.unwrap(),[1,13]);
	}

	/// The delimiter `/` appears in escaped form inside the pattern body.
	#[test]
	fn test_regex_from_delimited_literal(){
		let compiled = regex_from_delimited_literal(br"(?i)ab+c\/",b"/").unwrap();
		assert!(compiled.is_match("ABBBC/"));
	}

	/// Two pattern bodies are compiled into one multi-pattern regex.
	#[test]
	fn test_regexset_from_delimited_literals(){
		let haystack = "ABBBC abc123";
		let bodies:[&[u8];2] = [br"(?i)ab+c", br"\d+"];
		let set = regexset_from_delimited_literals(&bodies,b"/").unwrap();
		let found:Vec<Match> = set.find_iter(haystack).collect();
		let expected = vec![Match::must(0,0..5),Match::must(0,6..9),Match::must(1,9..12)];
		assert_eq!(found,expected);
	}
}
regex_literal/delimited.rs

regex_literal/
delimited.rs