//! A collection of helper functions that process literal conversions.
//comments style reference: https://doc.rust-lang.org/stable/reference/comments.html
//! # [`delimit`] and [`undelimit`]
//! provide text conversion between delimited and undelimited literals.


//!
//! # Others
//! [`unescape_from_bytes`] and [`unescape`] remove the escape character `\`
//! that is prefixed to delimiter sequences in regular expression literals.
//! [`escape`] and [`escape_into_bytes`] prepend `\` to all delimiters in input
//! texts.



use alloc::borrow::Cow; //used by unescape_from_bytes and unescape
use core::result::Result;
use alloc::string::String;


/// Removes the escape character `\` in front of every escaped delimiter
/// sequence found in `input`.
///
/// A backslash that is not immediately followed by `delimiter` is kept as is,
/// so ordinary regular-expression escapes such as `\d` survive. Note that an
/// empty `delimiter` matches after every backslash, so every backslash is
/// removed in that case.
///
/// # Returns
/// * `Ok(Cow::Borrowed(_))` when no escape character had to be removed; the
///   text borrows from `input` and nothing is allocated.
/// * `Ok(Cow::Owned(_))` when at least one escape character was dropped.
///
/// # Errors
/// Returns a message starting with ``Err in `unescape_from_bytes` execution:``
/// followed by the UTF-8 decoding error when the resulting bytes are not
/// valid UTF-8.
pub fn unescape_from_bytes<'a>(input:&'a [u8],delimiter:&[u8]) -> Result<Cow<'a, str>,String> {
	const ESCAPE: u8 = b'\\';
	const ERR_PREFIX: &str = "Err in `unescape_from_bytes` execution:";

	// Stays `None` until the first escape character is dropped, so inputs
	// without escaped delimiters can be returned as a plain borrow.
	let mut owned: Option<Vec<u8>> = None;
	let mut i = 0;
	while i < input.len() {
		let thisbyte = input[i];
		// `i < input.len()` keeps `i + 1` a valid slice start; `starts_with`
		// is simply false when fewer bytes than the delimiter remain.
		if thisbyte == ESCAPE && input[i + 1..].starts_with(delimiter) {
			// First hit: copy everything before the backslash. Afterwards
			// only the delimiter is appended, which skips the backslash.
			owned
				.get_or_insert_with(|| input[..i].to_vec())
				.extend_from_slice(delimiter);
			i += 1 + delimiter.len();
		} else {
			// Only copy once an owned buffer exists; otherwise the byte is
			// already part of the borrowed input.
			if let Some(text) = owned.as_mut() {
				text.push(thisbyte);
			}
			i += 1;
		}
	}

	// The error message is only built on the failure path.
	match owned {
		Some(bytes) => String::from_utf8(bytes)
			.map(Cow::Owned)
			.map_err(|e| format!("{ERR_PREFIX}{e}")),
		None => std::str::from_utf8(input)
			.map(Cow::Borrowed)
			.map_err(|e| format!("{ERR_PREFIX}{e}")),
	}
}



/// String-slice front end of [`unescape_from_bytes`].
///
/// Drops the escape character `\` in front of every escaped `delimiter`
/// sequence in `input`. The result is `Ok` with a [`Cow`] wrapped string on
/// success, or `Err` with a `String` message on failure.
/// `unescape` is the reverse of [`escape`](fn@escape).
pub fn unescape<'a>(input:&'a str,delimiter:&str) -> Result<Cow<'a, str>,String> {
	let (text_bytes, delimiter_bytes) = (input.as_bytes(), delimiter.as_bytes());
	unescape_from_bytes(text_bytes, delimiter_bytes)
}


/// Strips the enclosing delimiter pair from `input` and unescapes the
/// delimiters inside the remaining content.
/// `undelimit` is the reverse of [`delimit`](fn@delimit).
///
/// An input that consists of nothing but the delimiter pair (for example
/// `"//"` with delimiter `"/"`) is accepted and yields an empty string.
///
/// # Errors
/// * `input` is shorter than two delimiters.
/// * `input` does not both start and end with `delimiter`.
/// * any error reported by [`unescape`] for the enclosed content.
pub fn undelimit(input:&str, delimiter:&str) -> Result<String,String> {
	// Reject only inputs that are really too short to hold both delimiters;
	// exactly two delimiters is a valid, empty literal.
	if input.len() < 2 * delimiter.len() {
		return Err(format!("The input length is shorter than the length of a pair of `{delimiter}`."));
	}
	// `strip_prefix`/`strip_suffix` compare whole string pieces, so they never
	// slice through the middle of a multi-byte character the way byte-offset
	// indexing can. Stripping the suffix from the remainder also guarantees
	// the two delimiters do not overlap.
	let content = input
		.strip_prefix(delimiter)
		.and_then(|rest| rest.strip_suffix(delimiter))
		.ok_or_else(|| format!("The input is not delimitable by the delimiter {delimiter}."))?;
	unescape(content, delimiter).map(|undelimited_cow| undelimited_cow.into_owned())
}

/// Prefixes every occurrence of `delimiter_str` in `input` with the escape
/// character `\`.
///
/// Occurrences are located from the right end of `input` (`rmatch_indices`).
/// When there is no occurrence the input is handed back as `Cow::Borrowed`;
/// otherwise an escaped copy is returned as `Cow::Owned`.
pub fn escape<'a>(input:&'a str,delimiter_str:&str) -> Cow<'a, str> {
	let mut positions = input
		.rmatch_indices(delimiter_str)
		.map(|(at, _)| at)
		.peekable();
	if positions.peek().is_none() {
		return Cow::Borrowed(input);
	}
	let mut escaped = String::from(input);
	// Positions arrive right-to-left, so each insertion leaves the remaining
	// (smaller) byte offsets valid.
	for at in positions {
		escaped.insert(at, '\\');
	}
	Cow::Owned(escaped)
}

/// Escapes every delimiter sequence in `input` (see [`escape`]) and returns
/// the result as bytes.
///
/// The bytes borrow from `input` when nothing had to be escaped; otherwise
/// the escaped string is converted into an owned byte vector.
pub fn escape_into_bytes<'a>(input:&'a str,delimiter_str:&str) -> Cow<'a, [u8]> {
	match escape(input,delimiter_str) {
		// Reuse the escaped string's buffer instead of copying it.
		Cow::Owned(escaped_string) => Cow::Owned(escaped_string.into_bytes()),
		Cow::Borrowed(_) => Cow::Borrowed(input.as_bytes()),
	}
}

/// Creates a delimited regular expression literal: the escaped `input`
/// (see [`escape`]) enclosed between two copies of `delimiter_str`.
pub fn delimit(input:&str,delimiter_str:&str) -> String {
	let escaped = escape(input,delimiter_str);
	[delimiter_str, &*escaped, delimiter_str].concat()
}

/// Applies [`delimit`] to every entry of `inputs`, returning the delimited
/// regular expressions in the same order.
pub fn delimit_many(inputs:&[&str],delimiter_str:&str) -> Vec<String> {
	let mut delimited = Vec::with_capacity(inputs.len());
	for literal in inputs {
		delimited.push(delimit(literal,delimiter_str));
	}
	delimited
}

/// Skips the whitespace characters that begin at byte index `start` of
/// `reliteral`.
///
/// Returns the byte index (counted from the beginning of `reliteral`, i.e.
/// `start` plus the number of whitespace bytes skipped) of the first byte
/// that does not begin a whitespace character. When `start` is at or beyond
/// the end of `reliteral`, `start` is returned unchanged.
///
/// A truncated multi-byte sequence at the end of the input is treated as
/// non-whitespace instead of causing an out-of-bounds panic.
pub fn offset_ws(reliteral:&[u8],start:usize) -> usize {
	// UTF-8 encodings of the 25 Unicode characters with the property
	// White_Space=yes, grouped by encoded length. Reference:
	// <https://en.wikipedia.org/wiki/Whitespace_character#Unicode>
	//
	//       Unicode         | UTF-8 bytes
	// --------------------  | ------
	// 0000 0000 - 0000 007F |     1
	// 0000 0080 - 0000 07FF |     2
	// 0000 0800 - 0000 FFFF |     3
	// 0001 0000 - 0010 FFFF |     4
	//
	// The byte form of a character can be printed with:
	// let ws_str = '\u{3000}'.to_string();
	// println!("{:x?}",ws_str.as_bytes());

	// U+0000..=U+007F are encoded as a single byte identical to ASCII.
	const WS_AS_ONE_BYTE: [u8;6] = [0x09,0x0A,0x0B,0x0C,0x0D,0x20];
	const WS_AS_TWO_BYTES: [[u8;2];2] = [[0xc2, 0x85],[0xc2, 0xa0]];
	const WS_AS_THREE_BYTES: [[u8;3];17] = [
		[0xe1, 0x9a, 0x80],[0xe2, 0x80, 0x80],[0xe2, 0x80, 0x81],[0xe2, 0x80, 0x82],
		[0xe2, 0x80, 0x83],[0xe2, 0x80, 0x84],[0xe2, 0x80, 0x85],[0xe2, 0x80, 0x86],
		[0xe2, 0x80, 0x87],[0xe2, 0x80, 0x88],[0xe2, 0x80, 0x89],[0xe2, 0x80, 0x8a],
		[0xe2, 0x80, 0xa8],[0xe2, 0x80, 0xa9],[0xe2, 0x80, 0xaf],[0xe2, 0x81, 0x9f],
		[0xe3, 0x80, 0x80]
	];

	let mut pos = start;
	while pos < reliteral.len() {
		let rest = &reliteral[pos..];
		let start_byte = rest[0];
		// A byte is two hex digits; shifting right by 4 leaves the high one,
		// which tells how long the encoded character is (below 0b1000 means
		// the byte is ASCII, i.e. under 128).
		let width = match start_byte >> 4 {
			0b0000..=0b0111 if WS_AS_ONE_BYTE.contains(&start_byte) => 1,
			// every two-byte whitespace character starts with 0b1100
			0b1100 if WS_AS_TWO_BYTES.iter().any(|ws| rest.starts_with(&ws[..])) => 2,
			// three-byte characters all start with 0b1110
			0b1110 if WS_AS_THREE_BYTES.iter().any(|ws| rest.starts_with(&ws[..])) => 3,
			// Not whitespace. `starts_with` is false when fewer bytes remain
			// than the candidate needs, so truncated input also ends here.
			_ => break,
		};
		pos += width;
	}
	pos
}


/// Infers the encoded length of a UTF-8 character from its first byte.
/// Reference: [UTF-8](https://en.wikipedia.org/wiki/UTF-8)
///
/// | leading byte | bit pattern | result |
/// |--------------|-------------|--------|
/// | `0x00..=0x7F` | `0xxxxxxx` | 1 |
/// | `0xC0..=0xDF` | `110xxxxx` | 2 |
/// | `0xE0..=0xEF` | `1110xxxx` | 3 |
/// | `0xF0..=0xF7` | `11110xxx` | 4 |
///
/// Returns `0` for any other byte: continuation bytes (`10xxxxxx`) and
/// `0xF8..=0xFF` cannot start a character. Only the bit pattern is examined;
/// bytes such as `0xC0`, `0xC1` and `0xF5..=0xF7` match a pattern but never
/// occur in well-formed UTF-8.
pub fn infer_char_size (byte:u8) -> u8 {
	match byte {
		0x00..=0x7F => 1u8, // 0x7F is the last code point of the one-byte group
		// Two-byte leads span both high nibbles 0b1100 and 0b1101.
		0xC0..=0xDF => 2u8,
		0xE0..=0xEF => 3u8,
		0xF0..=0xF7 => 4u8,
		_ => 0u8, // invalid prefix/leading byte in UTF-8 encoding
	}
}





#[cfg(test)]
mod tests {
	use super::*;

	/// The leading byte of a 1-, 2- and 3-byte character reports its width.
	#[test]
	fn test_infer_char_size() {
		for (sample, width) in [('~', 1u8), ('ß', 2), ('中', 3)] {
			let encoded = sample.to_string();
			assert_eq!(infer_char_size(encoded.as_bytes()[0]), width);
		}
	}

	/// Covers the borrowed path, the owned path, multi-byte delimiters and
	/// the invalid UTF-8 error.
	#[test]
	fn test_unescape_from_bytes() {
		// A raw byte string literal does not process `\`, so this equals
		// b"/\\d{4}-\\d{2}-\\d{2}/".
		let iso_date = br"/\d{4}-\d{2}-\d{2}/";
		let unescaped = unescape_from_bytes(&iso_date[1..18], &[b'/']).unwrap();
		assert!(matches!(unescaped, Cow::Borrowed(_)));
		assert_eq!(unescaped.into_owned(), String::from(r"\d{4}-\d{2}-\d{2}"));

		// Escaped delimiters force an owned copy.
		let slashed_date = b"/\\d{2}\\/\\d{2}\\/\\d{4}/";
		let unescaped = unescape_from_bytes(&slashed_date[1..20], &[b'/']).unwrap();
		assert!(matches!(unescaped, Cow::Owned(_)));
		assert_eq!(unescaped.into_owned(), String::from(r#"\d{2}/\d{2}/\d{4}"#));

		// `##` used as the delimiter.
		let hashed = b"##(?i)ab+c##";
		let unescaped = unescape_from_bytes(&hashed[2..10], &[b'#', b'#']);
		assert_eq!(unescaped.unwrap().into_owned(), String::from("(?i)ab+c"));

		// A literal that contains an escaped delimiter.
		let hashed_escaped = b"##(?i)\\##ab+c##";
		let unescaped = unescape_from_bytes(&hashed_escaped[2..13], &[b'#', b'#']);
		assert_eq!(unescaped.unwrap().into_owned(), String::from("(?i)##ab+c"));

		// Bytes that are not valid UTF-8 produce an error.
		let invalid_utf8 = vec![0, 159];
		let outcome = unescape_from_bytes(&invalid_utf8[..], &[b'/']);
		assert!(outcome.is_err());
	}

	/// The `&str` wrapper yields an owned result when an escape is removed.
	#[test]
	fn test_unescape() {
		let slashed_date = "/\\d{2}\\/\\d{2}\\/\\d{4}/";
		let unescaped = unescape(&slashed_date[1..20], "/").unwrap();
		assert!(matches!(unescaped, Cow::Owned(_)));
		assert_eq!(unescaped.into_owned(), String::from(r#"\d{2}/\d{2}/\d{4}"#));
	}

	/// Enclosing delimiters are stripped and inner ones unescaped.
	#[test]
	fn test_undelimit() {
		let delimited = "/\\d{2}\\/\\d{2}\\/\\d{4}/";
		let undelimited = undelimit(delimited, "/");
		assert_eq!(undelimited.unwrap(), String::from(r#"\d{2}/\d{2}/\d{4}"#));
	}

	/// Text without the delimiter is unchanged; delimiters get a `\` prefix.
	#[test]
	fn test_escape() {
		let delimiter_str = "/";

		let without_delimiter = r"\d{4}-\d{2}-\d{2}";
		let escaped = escape(without_delimiter, delimiter_str);
		assert_eq!(escaped.as_ref(), r"\d{4}-\d{2}-\d{2}");

		// A regex that includes the delimiter (forward slash `/`).
		let with_delimiter = r"\d{2}/\d{2}/\d{4}";
		let escaped = escape(with_delimiter, delimiter_str);
		assert_eq!(escaped.as_ref(), r"\d{2}\/\d{2}\/\d{4}");
	}

	/// Same as `test_escape`, but compared as bytes.
	#[test]
	fn test_escape_into_bytes() {
		let with_delimiter = r"\d{2}/\d{2}/\d{4}";
		let escaped = escape_into_bytes(with_delimiter, "/");
		assert_eq!(escaped.as_ref(), br"\d{2}\/\d{2}\/\d{4}");
	}

	/// Delimiting escapes only occurrences of the chosen delimiter.
	#[test]
	fn test_delimit() {
		// A regex that includes the forward slash `/`.
		let slashed_date = r"\d{2}/\d{2}/\d{4}";

		let by_slash = delimit(slashed_date, "/");
		assert_eq!(by_slash, String::from(r"/\d{2}\/\d{2}\/\d{4}/"));

		let by_hash = delimit(slashed_date, "#");
		assert_eq!(by_hash, String::from(r"#\d{2}/\d{2}/\d{4}#"));

		let re_tag_crate = r"(?-u:#[\w+-\.]+)";
		let tag_by_hash = delimit(re_tag_crate, "#");
		assert_eq!(tag_by_hash, String::from(r"#(?-u:\#[\w+-\.]+)#"));
	}

	/// Multi-byte (Chinese) text around escaped delimiters stays intact.
	#[test]
	fn test_unescape_from_bytes_chinese() {
		// Bytes of r"\/天下\/一家", obtained with:
		// println!("the bytes are:`{:X?}`", r"\/天下\/一家".as_bytes());
		let bytes: [u8; 16] = [
			0x5C, 0x2F, 0xE5, 0xA4, 0xA9, 0xE4, 0xB8, 0x8B,
			0x5C, 0x2F, 0xE4, 0xB8, 0x80, 0xE5, 0xAE, 0xB6,
		];

		let unescaped = unescape_from_bytes(&bytes[..], &[b'/']).unwrap();
		assert_eq!(unescaped.into_owned(), String::from(r"/天下/一家"));
	}
}

// To compile this utility into an rlib and then include it in a cargo project, see https://stackoverflow.com/questions/50731453/how-to-statically-link-to-an-existing-rlib