regex_literal/
util.rs

1//! A collection of helper functions that process literal conversions.
2//comments style reference: https://doc.rust-lang.org/stable/reference/comments.html
3//! # [`delimit`] and [`undelimit`] 
//! provide text conversion between delimited and undelimited literals.
5
6
7//!
8//! # Others
//! [`unescape_from_bytes`] and [`unescape`] remove the escape character `\`
//! that is prefixed to delimiter sequences in regular expression literals.
//! [`escape`] and [`escape_into_bytes`] prepend `\` to all delimiters in input
//! texts.
13
14
15
16use alloc::borrow::Cow; //used by unescape_from_bytes and unescape
17use core::result::Result;
18use alloc::string::String;
19
20
/// Remove the escape character `\` in front of every escaped delimiter
/// sequence found in `input`.
///
/// A backslash is dropped only when it is immediately followed by the complete
/// `delimiter` byte sequence; every other backslash is kept as is.
///
/// The result borrows from `input` when no escaped delimiter is present and
/// is an owned `String` otherwise.
///
/// # Errors
/// Returns a message prefixed with
/// ``Err in `unescape_from_bytes` execution:`` when the resulting bytes are
/// not valid UTF-8.
pub fn unescape_from_bytes<'a>(input:&'a [u8],delimiter:&[u8]) -> Result<Cow<'a, str>,String> {
	// Stays `None` (the borrowed case) until the first escaped delimiter is met.
	let mut owned: Option<Vec<u8>> = None;
	let mut i = 0;
	while i < input.len() {
		let byte = input[i];
		// `i < input.len()` guarantees that `i + 1` is a valid slice start.
		if byte == b'\\' && input[i + 1..].starts_with(delimiter) {
			// First hit: copy everything that precedes this backslash.
			let buffer = owned.get_or_insert_with(|| input[..i].to_vec());
			buffer.extend_from_slice(delimiter);
			// Skip the backslash together with the delimiter it escaped.
			i += 1 + delimiter.len();
		} else {
			// Ordinary byte, or a backslash that does not escape a delimiter.
			if let Some(buffer) = owned.as_mut() {
				buffer.push(byte);
			}
			i += 1;
		}
	}
	// The error text is only built when the UTF-8 validation actually fails.
	let utf8_error = |e: &dyn core::fmt::Display| format!("Err in `unescape_from_bytes` execution:{e}");
	match owned {
		Some(bytes) => String::from_utf8(bytes).map(Cow::Owned).map_err(|e| utf8_error(&e)),
		None => core::str::from_utf8(input).map(Cow::Borrowed).map_err(|e| utf8_error(&e)),
	}
}
78
79
80
81/// Convert any escaped delimiter sequence in the input string slices. 
82/// Use unescape function for removing the escape character '\' from each 
83/// escaped delimiter sequences in input string slices. This returns two variants,
84/// a Cow wrapped string if succeeds, and an error in String type if fails.
85/// unescape is the reversed function of [`escape`](fn@escape) .
86pub fn unescape<'a>(input:&'a str,delimiter:&str) -> Result<Cow<'a, str>,String> {
87	unescape_from_bytes(input.as_bytes(),delimiter.as_bytes())
88}
89
90
91/// Strip off the enclosing delimiter pair and unescape delimiters from the input content.
92/// undelimit is the reversed function of [`delimit`](fn@delimit) .
93pub fn undelimit(input:&str, delimiter:&str) -> Result<String,String> {
94	let delim_len = delimiter.len();
95	let input_len = input.len();
96	if input_len > 2 * delim_len {
97			let start_chunk = &input[0..delim_len];
98			let r_delim_index = input_len - delim_len;//the right hand delimiter index
99			let end_chunk = &input[r_delim_index..input_len];
100			if start_chunk == delimiter && end_chunk == delimiter {
101				match unescape(&input[delim_len..r_delim_index],delimiter) {
102					Ok(undelimited_cow) => Ok(undelimited_cow.into_owned()) ,
103					Err(msg) => Err(msg),
104				}	
105			}
106			else {Err(format!("The input is not delimitable by the delimiter {delimiter}."))}
107	} else {
108		Err(format!("The input length is shorter than the length of a pair of `{delimiter}`."))
109	}	
110}
111
/// Prefix every occurrence of `delimiter_str` in `input` with the escape
/// character `\`.
///
/// Occurrences are located with `str::rmatch_indices`, i.e. non-overlapping
/// matches searched from the end of the text. When `input` does not contain
/// the delimiter it is returned borrowed, without allocating.
pub fn escape<'a>(input:&'a str,delimiter_str:&str) -> Cow<'a, str> {
	if !input.contains(delimiter_str) {
		return Cow::Borrowed(input);
	}
	// Matches are produced right-to-left; put them in ascending order so the
	// output can be assembled in one left-to-right pass instead of shifting the
	// tail of the string on every insertion.
	let mut positions: Vec<usize> = input
		.rmatch_indices(delimiter_str)
		.map(|(index, _)| index)
		.collect();
	positions.reverse();

	let mut escaped = String::with_capacity(input.len() + positions.len());
	let mut copied_up_to = 0;
	for index in positions {
		escaped.push_str(&input[copied_up_to..index]);
		escaped.push('\\');
		copied_up_to = index;
	}
	escaped.push_str(&input[copied_up_to..]);
	Cow::Owned(escaped)
}
124
125/// Use escape_into_bytes function for escaping delimiter sequences and converting regular expression text into bytes
126pub fn escape_into_bytes<'a>(input:&'a str,delimiter_str:&str) -> Cow<'a, [u8]> {
127	let text = escape(input,delimiter_str);
128	if let Cow::Owned(escaped_string) = text {
129		Cow::Owned(escaped_string.into_bytes()) // collect an iterator into array,ref https://users.rust-lang.org/t/collect-into-an-array/55498
130	}else {Cow::Borrowed(input.as_bytes())}
131} 
132
133/// create a delimited regular expression with the given delimiter string.
134pub fn delimit(input:&str,delimiter_str:&str) -> String {
135	let mut output = String::from(delimiter_str);
136	let escaped = escape(input,delimiter_str);
137	output.push_str(escaped.as_ref());
138	output.push_str(delimiter_str);
139	output
140}
141
142/// testing: creating a string list of delimited regular expressions 
143pub fn delimit_many(inputs:&[&str],delimiter_str:&str) -> Vec<String> {
144	let v:Vec<String> = inputs.iter().map(|x| delimit(x,delimiter_str)).collect();
145	v	
146}
147
/// Skip the whitespace that begins at byte index `start` of `reliteral`.
///
/// Returns the byte index of the first byte at or after `start` that does not
/// begin a whitespace character, i.e. `start` plus the number of whitespace
/// bytes skipped. When the rest of the input is all whitespace the result is
/// `reliteral.len()`; when `start` lies beyond the end of the input, `start`
/// is returned unchanged.
///
/// A multi-byte sequence that is cut off by the end of the input is treated as
/// non-whitespace rather than causing a panic.
pub fn offset_ws(reliteral:&[u8],start:usize) -> usize {
	// UTF-8 encodings of the 25 code points with the property White_Space=yes,
	// reference: <https://en.wikipedia.org/wiki/Whitespace_character#Unicode>.
	// The byte form of a code point can be printed with e.g.
	// `println!("{:x?}", '\u{3000}'.to_string().as_bytes());`
	//
	//       Unicode         | UTF-8 bytes
	// --------------------  | ------
	// 0000 0000 - 0000 007F |     1   0xxxxxxx
	// 0000 0080 - 0000 07FF |     2   110xxxxx 10xxxxxx
	// 0000 0800 - 0000 FFFF |     3   1110xxxx 10xxxxxx 10xxxxxx
	// 0001 0000 - 001F FFFF |     4   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

	// Code points 0-7F are encoded as a single byte identical to their ASCII code.
	const WS_ONE_BYTE: [u8;6] = [0x09,0x0A,0x0B,0x0C,0x0D,0x20];
	const WS_TWO_BYTES: [[u8;2];2] = [[0xc2, 0x85],[0xc2, 0xa0]];
	const WS_THREE_BYTES: [[u8;3];17] = [
		[0xe1, 0x9a, 0x80],[0xe2, 0x80, 0x80],[0xe2, 0x80, 0x81],[0xe2, 0x80, 0x82],
		[0xe2, 0x80, 0x83],[0xe2, 0x80, 0x84],[0xe2, 0x80, 0x85],[0xe2, 0x80, 0x86],
		[0xe2, 0x80, 0x87],[0xe2, 0x80, 0x88],[0xe2, 0x80, 0x89],[0xe2, 0x80, 0x8a],
		[0xe2, 0x80, 0xa8],[0xe2, 0x80, 0xa9],[0xe2, 0x80, 0xaf],[0xe2, 0x81, 0x9f],
		[0xe3, 0x80, 0x80]
	];

	let mut pos = start;
	// `get` yields `None` once `pos` is past the end, so no slice can go out of range.
	while let Some(rest) = reliteral.get(pos..) {
		// `starts_with` is false when fewer bytes remain than the sequence needs,
		// which covers a lead byte truncated by the end of the input.
		let width = match rest.first() {
			Some(byte) if WS_ONE_BYTE.contains(byte) => 1,
			Some(_) if WS_TWO_BYTES.iter().any(|seq| rest.starts_with(&seq[..])) => 2,
			Some(_) if WS_THREE_BYTES.iter().any(|seq| rest.starts_with(&seq[..])) => 3,
			_ => break,
		};
		pos += width;
	}
	pos
}
207
208
/// Infer the length in bytes of a UTF-8 encoded character from its first byte.
///
/// Returns `1`, `2`, `3` or `4` for a byte that can start a UTF-8 sequence and
/// `0` for a byte that cannot: continuation bytes (`0x80..=0xBF`), the
/// overlong lead bytes `0xC0`/`0xC1`, and `0xF5..=0xFF`, which would encode
/// values above U+10FFFF.
/// reference: [UTF-8](https://en.wikipedia.org/wiki/UTF-8)
pub fn infer_char_size (byte:u8) -> u8 {
	// Matching on the whole byte (not only its high nibble) is what separates
	// the valid four-byte leads 0xF0..=0xF4 from the invalid 0xF5..=0xFF.
	match byte {
		0x00..=0x7F => 1u8, // 0x7F is the last code point in the one-byte group
		0xC2..=0xDF => 2u8,
		0xE0..=0xEF => 3u8,
		0xF0..=0xF4 => 4u8,
		_ => 0u8, // not a valid leading byte in UTF-8 encoding
	}
}
223
224
225
226
227
#[cfg(test)]
mod tests {
	use super::*;

	/// The lead bytes of `~`, `ß` and `中` announce 1-, 2- and 3-byte characters.
	#[test]
	fn test_infer_char_size() {
		let expectations = [('~', 1u8), ('ß', 2u8), ('中', 3u8)];
		for &(character, expected_size) in expectations.iter() {
			let encoded = character.to_string();
			assert_eq!(infer_char_size(encoded.as_bytes()[0]), expected_size);
		}
	}

	#[test]
	fn test_unescape_from_bytes() {
		// A raw byte string keeps `\` literally, i.e. this equals b"/\\d{4}-\\d{2}-\\d{2}/".
		let plain = br"/\d{4}-\d{2}-\d{2}/";
		let plain_unescaped = unescape_from_bytes(&plain[1..18], b"/").unwrap();
		// no escaped delimiter inside, so the input is only borrowed
		assert!(matches!(plain_unescaped, Cow::Borrowed(_)));
		assert_eq!(plain_unescaped.into_owned(), String::from(r"\d{4}-\d{2}-\d{2}"));

		// escaped `/` inside the literal forces an owned result
		let slashed = br"/\d{2}\/\d{2}\/\d{4}/";
		let slashed_unescaped = unescape_from_bytes(&slashed[1..20], b"/").unwrap();
		assert!(matches!(slashed_unescaped, Cow::Owned(_)));
		assert_eq!(slashed_unescaped.into_owned(), String::from(r#"\d{2}/\d{2}/\d{4}"#));

		// `##` used as the delimiter
		let hashed = b"##(?i)ab+c##";
		let hashed_unescaped = unescape_from_bytes(&hashed[2..10], b"##");
		assert_eq!(hashed_unescaped.unwrap().into_owned(), String::from("(?i)ab+c"));

		// a literal that contains an escaped `##` delimiter
		let hashed_escaped = br"##(?i)\##ab+c##";
		let hashed_escaped_result = unescape_from_bytes(&hashed_escaped[2..13], b"##");
		assert_eq!(hashed_escaped_result.unwrap().into_owned(), String::from("(?i)##ab+c"));

		// an invalid UTF-8 sequence must surface as an error
		let invalid: Vec<u8> = vec![0, 159];
		assert!(unescape_from_bytes(&invalid, b"/").is_err());
	}

	#[test]
	fn test_unescape() {
		let literal = r"/\d{2}\/\d{2}\/\d{4}/";
		let unescaped = unescape(&literal[1..20], "/").unwrap();
		assert!(matches!(unescaped, Cow::Owned(_)));
		assert_eq!(unescaped.into_owned(), String::from(r#"\d{2}/\d{2}/\d{4}"#));
	}

	#[test]
	fn test_undelimit() {
		let literal = r"/\d{2}\/\d{2}\/\d{4}/";
		let undelimited = undelimit(literal, "/").unwrap();
		assert_eq!(undelimited, String::from(r#"\d{2}/\d{2}/\d{4}"#));
	}

	#[test]
	fn test_escape() {
		let delimiter_str = "/";
		// (input, expected escaped text); the second regex contains the delimiter `/`
		let cases = [
			(r"\d{4}-\d{2}-\d{2}", r"\d{4}-\d{2}-\d{2}"),
			(r"\d{2}/\d{2}/\d{4}", r"\d{2}\/\d{2}\/\d{4}"),
		];
		for &(regex, expected) in cases.iter() {
			let escaped = escape(regex, delimiter_str);
			assert_eq!(&*escaped, expected);
		}
	}

	#[test]
	fn test_escape_into_bytes() {
		let escaped = escape_into_bytes(r"\d{2}/\d{2}/\d{4}", "/");
		assert_eq!(&*escaped, &br"\d{2}\/\d{2}\/\d{4}"[..]);
	}

	#[test]
	fn test_delimit() {
		// a regex that includes the forward slash `/`
		let date_regex = r"\d{2}/\d{2}/\d{4}";
		assert_eq!(delimit(date_regex, "/"), String::from(r"/\d{2}\/\d{2}\/\d{4}/"));
		// with `#` as the delimiter the slashes stay untouched
		assert_eq!(delimit(date_regex, "#"), String::from(r"#\d{2}/\d{2}/\d{4}#"));

		// a regex that itself contains the `#` delimiter
		let tag_regex = r"(?-u:#[\w+-\.]+)";
		assert_eq!(delimit(tag_regex, "#"), String::from(r"#(?-u:\#[\w+-\.]+)#"));
	}

	#[test]
	fn test_unescape_from_bytes_chinese() {
		// UTF-8 bytes of r"\/天下\/一家"
		let bytes: [u8; 16] = [
			0x5C, 0x2F, 0xE5, 0xA4, 0xA9, 0xE4, 0xB8, 0x8B,
			0x5C, 0x2F, 0xE4, 0xB8, 0x80, 0xE5, 0xAE, 0xB6,
		];
		let unescaped = unescape_from_bytes(&bytes, b"/").unwrap();
		assert_eq!(unescaped.into_owned(), String::from(r"/天下/一家"));
	}
}
365
// For compiling this utility into an rlib and then including it in a cargo project, see https://stackoverflow.com/questions/50731453/how-to-statically-link-to-an-existing-rlib
regex_literal/util.rs

regex_literal/
util.rs