1 #![allow(unused)]
11
12pub use regex_automata::{meta::{Regex,BuildError},Span,Match,PatternID,Input,Anchored};
13pub use crate::error::Error;
14use crate::util;
15
16use core::cell::Cell;use alloc::collections::btree_map::BTreeMap;use core::convert::TryFrom; use alloc::str;
21use alloc::str::FromStr; use core::cmp::PartialEq;
24
25use core::result::Result;
26use alloc::string::String;
27
28use core::slice::{Iter,IterMut};
29use alloc::vec::{Vec,IntoIter};
30
thread_local! {
    /// Per-thread regex-literal delimiter, packed big-endian into a `u32`.
    ///
    /// Starts out as a single `/` byte.
    pub static DELIMITER: Cell<u32> = const { Cell::new(b'/' as u32) };
}

/// Opening/closing byte pairs of the two set forms: `[`..`]` (index 0) and `<`..`>` (index 1).
pub static RE_SET_DELIMITERS: [[u8; 2]; 2] = [*b"[]", *b"<>"];

/// Byte that separates the items inside a set literal.
pub static RE_SEPARATOR: u8 = b',';

/// ASCII punctuation bytes that are accepted as a regex-literal delimiter character.
pub static DELIMITER_CHARS: [u8; 20] = *b"!#$%&*+,./:;=?@^_|~-";
47
48pub fn set_delimiter(delimiter:&[u8]) -> bool {
75 let result = validate_delimiter(delimiter);
76 if result {
77 DELIMITER.with(
79 |cell_delimiter| {
80 let code = bytes_to_u32(delimiter).unwrap();
81 cell_delimiter.set(code);
82 }
83 );
84 }
85 result
86}
87
88pub fn get_delimiter() -> Vec<u8> {
89 let mut bytes = vec![];
90 DELIMITER.with(
91 |cell_delimiter| {
92 let delimiter_u32 = cell_delimiter.get();
93 bytes = u32_to_bytes(delimiter_u32);
94 }
95 );
96 bytes
97}
98pub fn validate_delimiter(delimiter:&[u8]) -> bool {
118 let delim_length = delimiter.len();
119 if delim_length > 4 {return false;} let first = delimiter[0];
121 if DELIMITER_CHARS.iter().any(|&x| x == first) {
122 if delim_length > 1 { let rest = &delimiter[1..];
124 rest.iter().all(|&x| x == first) } else {true} } else {false}
127}
128
/// Packs up to four bytes into a `u32`, most significant byte first.
///
/// Returns `None` when more than four bytes are given. An empty slice packs to `0`.
fn bytes_to_u32(bytes: &[u8]) -> Option<u32> {
    if bytes.len() > 4 {
        return None;
    }
    // Shifting the accumulator left by one byte per step yields the big-endian layout.
    let packed = bytes
        .iter()
        .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte));
    Some(packed)
}
143
/// Expands a packed `u32` into its big-endian bytes with leading zero bytes removed.
///
/// Zero bytes after the first non-zero byte are kept. If every byte is zero,
/// the result is a single `0` byte, so the returned vector is never empty.
fn u32_to_bytes(c: u32) -> Vec<u8> {
    let significant: Vec<u8> = c
        .to_be_bytes()
        .iter()
        .copied()
        .skip_while(|&byte| byte == 0)
        .collect();
    if significant.is_empty() {
        vec![0]
    } else {
        significant
    }
}
162
163#[derive(Debug,Clone)] pub struct ReSequence(Vec<Regex>);
169
170impl ReSequence {
171 #[inline]
174 pub const fn new() -> Self { ReSequence(Vec::new())
176 }
177 #[inline]
179 pub fn push(&mut self, elem:Regex) {
180 self.0.push(elem);
181 }
182 #[inline]
184 pub fn len(&self) -> usize {
185 self.0.len()
186 }
187
188 #[inline]
190 pub fn is_empty(&self) -> bool {
191 self.0.is_empty()
192 }
193 #[inline]
198 pub fn iter(&self) -> Iter<'_, Regex> {
199 self.0.iter()
200 }
201
202 #[inline]
205 pub fn iter_mut(&mut self) -> alloc::slice::IterMut<'_, Regex> {
206 self.0.iter_mut()
207 }
208
209
210 #[inline]
212 pub fn pop(&mut self) -> Option<Regex> {
213 self.0.pop()
214 }
215
216 #[inline]
218 pub fn remove(&mut self,index: usize) -> Regex {
219 self.0.remove(index)
220 }
221
222 #[inline]
224 pub fn as_slice(&self) -> &[Regex] {
225 self.0.as_slice()
226 }
227
228pub fn matching_indices(&self,target:&str) -> Result<Vec<u16>,String> {
231
232 let data_length = self.0.len();
233 let max_index = u16::MAX as usize;
234 if data_length == 0 {Err("Unable to do matching by an empty ReSequence struct".to_string())}
235 else if max_index < data_length - 1{
236 Err("Unable to do matching as this XRegex data contains more than 2^16 regex structs".to_string())
237 }else {
238 let index_iter = self.0.iter().enumerate()
239 .filter(|&(i,regex_ref)| regex_ref.is_match(target))
240 .map(|(i,_)| i as u16);
241 let indices: Vec<u16> = index_iter.collect();
242 Ok(indices)
243 }
244 }
245}
246
247
248impl IntoIterator for ReSequence {
255 type Item = Regex;
256 type IntoIter = IntoIter<Self::Item>;
257
258 #[inline]
259 fn into_iter(self) -> Self::IntoIter {
260 self.0.into_iter()
261 }
262}
263
264impl FromIterator<Regex> for ReSequence {
268 #[inline]
269 fn from_iter<I: IntoIterator<Item = Regex>>(iter: I) -> Self {
270 let mut c = ReSequence::new();
271 for i in iter {
272 c.push(i);
273 }
274 c
275 }
276}
277
278
279
/// The syntactic form of a parsed regex literal.
#[derive(Debug, Clone)]
pub enum LiteralForm {
    /// A sequence: items enclosed in `<`..`>`; each item is a single regex or a union.
    ReS,
    /// A union: single regexes enclosed in `[`..`]`, compiled into one multi-pattern regex.
    ReU,
    /// A single delimited regex.
    Re,
}
290
291#[derive(Debug, Clone)]
293pub struct XRegex {
294 pub literal:(u32,String),pub data:ReSequence,
296 pub kind:LiteralForm,
297}
298
299impl PartialEq for XRegex {
302 fn eq(&self, other: &Self) -> bool {
303 self.literal.0 == other.literal.0 && self.literal.1 == other.literal.1
304 }
305}
306
307
308impl TryFrom<&[u8]> for XRegex {
312 type Error = Error;fn try_from(value: &[u8]) -> Result<Self,Self::Error> {
314 DELIMITER.with(
315 |cell_delimiter| {
316 let delimiter_u32 = cell_delimiter.get();
317 let re_puncts:Vec<u8> = u32_to_bytes(delimiter_u32);
318 let metadata = parse(value,&re_puncts[..])?;
319 compile(value,metadata,&re_puncts[..])
320 }
321 )
322 }
323}
324
325impl FromStr for XRegex {
329 type Err = Error;
330 fn from_str(value:&str) -> Result<Self,Self::Err> {
331 Self::try_from(value.as_bytes())
332 }
333}
334
335impl XRegex {
336 pub fn new(re_text:&str,re_puncts:&[u8]) -> Result<Self,Error> {
338 if !validate_delimiter(re_puncts) {
339 let mut msg = "Invalid delimiter:".to_owned();
340 let msg_body = match str::from_utf8(re_puncts) {
341 Err(_) => "non-utf8-code",
342 Ok(puncts_str) => puncts_str,
343 };
344 msg.push_str(msg_body);
345 Err(Error::Syntax(msg))
346 }else {
347 let reliteral = re_text.as_bytes();
348 let metadata = parse(reliteral,re_puncts)?;
349 compile(reliteral,metadata,re_puncts)
350 }
351 }
352
353 pub fn is_seq(&self) -> bool {
355 matches!(self.kind,LiteralForm::ReS)
356 }
357
358 pub fn get_regex(&mut self) -> Option<Regex> {
360 if self.data.is_empty() {None}
361 else {Some(self.data.remove(0))}
362 }
363 pub fn as_regex(&self) -> Option<&Regex> { if self.is_seq() || self.data.is_empty() {None}
366 else {
367 let slice = self.data.as_slice();
368 Some(&slice[0])
369 }
370 }
371
372 pub fn as_slice(&self) -> Option<&[Regex]> {
374 if self.is_seq() {
375 Some(self.data.as_slice())
376 }else {None}
377 }
378
379 pub fn is_empty(&self) -> bool {
381 self.data.is_empty()
382 }
383
384
385}
386
387struct Meta {
393 kind:LiteralForm,
395 range:(u32,u32),
397 children:Option<Vec<u32>>,
399}
400
401
402impl Meta {
403 fn add_child(&mut self,child_index:u32) {
405 match &mut self.children { Some(ref mut vec) => {vec.push(child_index);}, None => {self.children = Some(<Vec<u32>>::from([child_index]));},
410 }
412 }
413 fn finalise(&mut self, right_range:u32){
415 self.range.1 = right_range;
416 }
417}
418fn create_meta_re(start:usize,end:usize) -> Meta {
420 Meta{kind:LiteralForm::Re,range:(start as u32,end as u32),children:None}
421}
422
423fn parse(reliteral: &[u8],re_puncts: &[u8]) -> Result <(u32,BTreeMap<u32,Meta>),Error>{
429 let mut meta_map:BTreeMap<u32,Meta> = BTreeMap::new();
431 let target_size = reliteral.len();
432 let reliteral_bytes_fitting_in_u32 = u32::try_from(target_size);
433 if let Err(err) = reliteral_bytes_fitting_in_u32 {
434 return Err(Error::Syntax(format!("Invalid reliteral as its size exceeds the limit of 2^32 bytes: {err}")));
435 }
436
437
438 let re_delimiter_length = re_puncts.len();
439 let mut index:usize = 0;
441 let start = util::offset_ws(reliteral,index);
442
443 index = start;
444 let walk_over = proceed(&mut index,reliteral,re_puncts,&mut meta_map);
445 if walk_over {
446 let end = util::offset_ws(reliteral,index);
447 if end != target_size {
448 return Err(Error::Syntax(format!("Invalid reliteral - an unparsed tail from byte index {end}.")));
449 }
450 if meta_map.is_empty() {
451 return Err(Error::Syntax("Invalid reliteral - no meta data has been parsed.".to_owned()))
452 }
453 Ok((start as u32,meta_map))
454 } else {
455 Err(Error::Syntax("Unrecognized reliteral format!".to_owned()))
456 }
457}
458
459 fn proceed(i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
461 store_re(None,i,reliteral,re_puncts,meta_map) ||
462 store_reu(None,i,reliteral,re_puncts,meta_map) ||
463 store_res(None,i,reliteral,re_puncts,meta_map)
464 }
465
466 fn store_re(pc_index:Option<u32>,i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
468 let j = *i;
469 let this_index = j as u32;
470 let re_delimiter_length = re_puncts.len();
471 if let Some(indices) = find_re_range(j,reliteral,re_puncts) {
472
473 meta_map.insert(this_index,create_meta_re(indices[0],indices[1]));if let Some(pos) = pc_index { if let Some(meta) = meta_map.get_mut(&pos) {
476 meta.add_child(this_index);
477 } else {
478 return false;}
480 }
481 *i = indices[1] + re_delimiter_length;
482 true
483 }else {false}
484 }
485
486 fn store_reu(pc_index:Option<u32>,i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
487 let start = *i;
488 let this_index = start as u32;
489 if (reliteral[start] == RE_SET_DELIMITERS[0][0]){ *i += 1;
491 let mut this_meta = Meta{kind:LiteralForm::ReU,range:(*i as u32,*i as u32),children:None};
492 meta_map.insert(this_index,this_meta);
493 *i = util::offset_ws(reliteral,*i);if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map) {
496 return false;
497 }
498 let mut proceeding = true;
500 while proceeding { *i = util::offset_ws(reliteral,*i);if reliteral[*i] == RE_SEPARATOR { *i += 1; *i = util::offset_ws(reliteral,*i);if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map) {
507 return false;
508 }}else {proceeding = false;}
510 }
511 if reliteral[*i] == RE_SET_DELIMITERS[0][1] { if let Some(this_meta) = meta_map.get_mut(&this_index){
515 this_meta.finalise(*i as u32);
516 }
517
518
519 if let Some(pos) = pc_index {
520 if let Some(meta) = meta_map.get_mut(&pos) {
521 meta.add_child(this_index);
522 } else {
523 return false;}
525 }
526 *i += 1; return true;
528 }
529 } false
531 }
532
533 fn store_res(pc_index:Option<u32>,i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
534 let start = *i;
535 let this_index = start as u32;
536 if (reliteral[start] == RE_SET_DELIMITERS[1][0]){ *i += 1;
538 let this_meta = Meta{kind:LiteralForm::ReS,range:(*i as u32,*i as u32),children:None};
539 meta_map.insert(this_index,this_meta);
540 *i = util::offset_ws(reliteral,*i);if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map)
542 && !store_reu(Some(this_index),i,reliteral,re_puncts,meta_map) {
543 return false;
544 }
545 let mut proceeding = true;
547 while proceeding { *i = util::offset_ws(reliteral,*i);if reliteral[*i] == RE_SEPARATOR { *i += 1; *i = util::offset_ws(reliteral,*i);if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map) &&
553 !store_reu(Some(this_index),i,reliteral,re_puncts,meta_map) {
554 return false;
555 }
556 }else {proceeding = false;}
557 }
558 if reliteral[*i] == RE_SET_DELIMITERS[1][1] { if let Some(this_meta) = meta_map.get_mut(&this_index){
561 this_meta.finalise(*i as u32);
562 }
563
564
565 if let Some(pos) = pc_index {
566 if let Some(meta) = meta_map.get_mut(&pos) {
567 meta.add_child(this_index);
568 } else {
569 return false;}
571 }
572
573 *i += 1; return true;
575 }
576 } false
578 } fn compile_re(source:&[u8],pool:&mut ReSequence,meta_ref:&Meta,re_puncts:&[u8]) -> Result<String,Error> {
584 let start = meta_ref.range.0 as usize;
585 let stop = meta_ref.range.1 as usize;
586 let regex = regex_from_delimited_literal(&source[start..stop],re_puncts)?;
587 pool.push(regex);let re_puncts_length = re_puncts.len();
589 let full_start = start-re_puncts_length;
590 let full_stop = stop+re_puncts_length;
591 let postback_bytes = &source[full_start..full_stop];
592 match str::from_utf8(postback_bytes) {
593 Ok(postback) => Ok(postback.to_owned()) ,
594 Err(error) => Err(Error::from_utf8_error(error,full_start)) ,
595 }
596}
597
598
599fn compile_reu(source:&[u8],pool:&mut ReSequence, meta_ref:&Meta,meta_map:&BTreeMap<u32,Meta>,re_puncts:&[u8])
601 -> Result<String,Error> {
602 let range:[u32;2] = [meta_ref.range.0,meta_ref.range.1];
603 if let Some(ref children_indices) = meta_ref.children {
604 let mut re_union:Vec<&[u8]> = Vec::new(); let mut re_literals:Vec<&str> = Vec::new(); let puncts_length = re_puncts.len();
610 for child_start_index in children_indices.iter() {
613 if let Some(child_meta_ref) = meta_map.get(child_start_index){
614
615 let start = child_meta_ref.range.0 as usize;
616 let stop = child_meta_ref.range.1 as usize;
617 let full_start = start - puncts_length;
618 let full_stop = stop + puncts_length;
619
620 let re_item = &source[start..stop];
621 re_union.push(re_item);
622
623 match str::from_utf8(&source[full_start..full_stop]) { Ok(re_literal) => re_literals.push(re_literal),
625 Err(err) => return Err(Error::from_utf8_error(err,full_start)),
626 }
627 } else { return Err(Error::Syntax(
629 format!("The literal of ReU (RegexUnion) ranging {range:?} does not have valid Regex item at byte index {child_start_index}.")
630 ));
631 }
632 }
633 if re_union.is_empty() {
634 return Err(Error::Syntax(
635 format!("The literal for ReU (RegexUnion) ranging {range:?} contains 0 regex item.")
636 ));
637 }
638
639 let regexset = regexset_from_delimited_literals(&re_union[..],re_puncts)?;
640 pool.push(regexset);
641 let mut postback_string = String::from("");
642 postback_string.push(char::from_u32(RE_SET_DELIMITERS[0][0] as u32).unwrap());
643 postback_string.push_str(&re_literals.join(str::from_utf8(&[RE_SEPARATOR]).unwrap()));
644 postback_string.push(char::from_u32(RE_SET_DELIMITERS[0][1] as u32).unwrap());
645 Ok(postback_string)
646 } else { Err(Error::Syntax(
648 format!("The literal for ReU (Regex Union) ranging {range:?} does not have valid Regex item.")
649 ))
650 }
651}
652
653fn compile_res(source:&[u8],pool:&mut ReSequence, meta_ref:&Meta,meta_map:&BTreeMap<u32,Meta>,re_puncts:&[u8]) ->
655Result<String,Error> {
656 let range:[u32;2] = [meta_ref.range.0,meta_ref.range.1];
657
658 if let Some(ref children_indices) = meta_ref.children {
659 let sequence_length = children_indices.len();
660 let mut literal_seq:Vec<String> = Vec::with_capacity(sequence_length);
661 for child_start_index in children_indices.iter() {
662 if let Some(child_meta_ref) = meta_map.get(child_start_index){
663 match child_meta_ref.kind {
664 LiteralForm::Re => {
665 let re = compile_re(source,pool,child_meta_ref,re_puncts)?;
666 literal_seq.push(re);
667 },
668 LiteralForm::ReU => {
669 let reu = compile_reu(source,pool,child_meta_ref,meta_map,re_puncts)?;
670 literal_seq.push(reu);
671 },
672 _ => {
673 return Err(Error::Syntax(
674 format!("The literal of ReS (ReSequence) ranging {range:?} has encountered an unhandled meta kind at index {child_start_index}.")
675 )
676 );
677 },
678 }
679 } else { return Err(Error::Syntax(
681 format!("Within ReS (ReSequence) ranging {range:?}, the Regex item cannot be located by its byte index {child_start_index}.")
682 ));
683 }
684 }
685 if literal_seq.is_empty() {
686 return Err(Error::Syntax(
687 format!("The literal for ReS (ReSequence) ranging {range:?} contains 0 Regex item.")
688 ));
689 }
690
691 let mut postback_string = String::from("");
692 postback_string.push(char::from_u32(RE_SET_DELIMITERS[1][0] as u32).unwrap());
693 let joined = &literal_seq[..].join(str::from_utf8(&[RE_SEPARATOR]).unwrap()); postback_string.push_str(&literal_seq.join(&joined[..]));
695 postback_string.push(char::from_u32(RE_SET_DELIMITERS[1][1] as u32).unwrap());
696 Ok(postback_string)
697
698 } else {
699 Err(Error::Syntax(
700 format!("The literal for ReS (Regex Sequence) positioned ranging {range:?} has zero Regex item.")
701 ))
702 }
703}
704
705 fn compile(source:&[u8],parsed:(u32,BTreeMap<u32,Meta>),re_puncts:&[u8]) -> Result<XRegex, Error> {
707 let re_delimiter_length = re_puncts.len();
708 let index = parsed.0;
709 let meta_map = &(parsed.1);
710 if let Some(meta_ref) = meta_map.get(&index) {
711
712 let mut pool:ReSequence = ReSequence::new();
713 let mut pool_ref = &mut pool;
714 let delimiter = bytes_to_u32(re_puncts).ok_or(Error::Syntax("Failed in delimiter transcoding.".to_owned()))?;match meta_ref.kind {
717 LiteralForm::Re => {
718 let re = compile_re(source,pool_ref,meta_ref,re_puncts)?;
719 Ok(XRegex{data:pool,literal:(delimiter,re),kind:LiteralForm::Re})
720 },
721 LiteralForm::ReU => {
722 let reu = compile_reu(source,pool_ref,meta_ref,meta_map,re_puncts)?;
723 Ok(XRegex{data:pool,literal:(delimiter,reu),kind:LiteralForm::ReU})
724 },
725 LiteralForm::ReS => {
726 let res = compile_res(source,pool_ref,meta_ref,meta_map,re_puncts)?;
727 Ok(XRegex{data:pool,literal:(delimiter,res),kind:LiteralForm::ReS})
728 },
729 }
730 }
731 else {
732 Err(Error::Syntax(format!("No meta data indexed at {index} in meta_map.")))
733 }
734 }
735
736fn regex_from_delimited_literal(rebody:&[u8],delimiter:&[u8]) ->
738Result<Regex,Error> {
739 let unescaped = match util::unescape_from_bytes(rebody,delimiter){
740 Ok(text) => text,
741 Err(err_info) => return Err(Error::Syntax(err_info)),
742 };
743 Regex::new(&unescaped[..]).map_err(Error::from_meta_build_error)
744}
745
746fn regexset_from_delimited_literals(rebodies:&[&[u8]],delimiter:&[u8]) ->
748Result<Regex,Error> {
749 let mut vec = Vec::new();for bytes_ref in rebodies.iter() {
751 let unescaped = match util::unescape_from_bytes(bytes_ref,delimiter){
752 Ok(text) => text,
753 Err(err_info) => return Err(Error::Syntax(err_info)),
754 };
755 vec.push(unescaped.into_owned());
756 }
757 let mut ref_vec = Vec::new();
759 let vec_refs = &vec;
760 for bytes_ref in vec_refs.iter(){ ref_vec.push(bytes_ref);
762 }
763 Regex::new_many(&ref_vec).map_err(Error::from_meta_build_error)
764}
765
766fn find_re_range(i:usize,reliteral:&[u8],re_puncts:&[u8]) -> Option<[usize;2]> {
771 let re_delimiter_length = re_puncts.len();
772 let target_length = reliteral.len();
773
774 let mut result:[usize;2] = [0;2];
775 let mut k = i;
776 let mut step:usize = 0;
777 let first_slice_end = k + re_delimiter_length;
778 if target_length > (k + 2 * re_delimiter_length) && &reliteral[k..first_slice_end] == re_puncts {
780 result[0] = k + re_delimiter_length;
781 step = re_delimiter_length;
782 }else {return None;}
783 let mut escaped = false;
784 while step > 0 {
785 k += step;
786 if (k + re_delimiter_length) > target_length {
787 return None;}
789 let code = reliteral[k];
790 let char_length = util::infer_char_size(code);
791 match char_length {
792 0 => {
793 println!("invalid UTF code is found at index {}",k);
794 return None;
795 },
796 1 => {
797 if code == b'\\' {
798 escaped = !escaped;
799 }else {
800 if !escaped {
801 if &reliteral[k..(k + re_delimiter_length)] == re_puncts { result[1] = k;
803 return Some(result);
804 }
805 }else {escaped = false;}
806 }
807 },
808 _ => {
809 if escaped {
810 escaped = false;
811 }
812 },
813
814 } step = char_length as usize;
816 } None
818
819}
820
#[cfg(test)]
mod tests {
    use super::*;

    /// An escaped delimiter inside the body must not end the literal.
    #[test]
    fn test_find_re_range() {
        let literal = "/(?i)\\/ab+c\\//".as_bytes();
        let found = find_re_range(0, literal, &[b'/']);
        assert_eq!(found.unwrap(), [1, 13]);
    }

    /// The escaped delimiter in the body matches a plain `/` in the haystack.
    #[test]
    fn test_regex_from_delimited_literal() {
        let regex = regex_from_delimited_literal(br"(?i)ab+c\/", &[b'/']).unwrap();
        assert!(regex.is_match("ABBBC/"));
    }

    /// Each match reports the index of the pattern that produced it.
    #[test]
    fn test_regexset_from_delimited_literals() {
        let haystack = "ABBBC abc123";
        let word_body = br"(?i)ab+c";
        let digits_body = br"\d+";
        let bodies = [&word_body[..], &digits_body[..]];
        let regex_set = regexset_from_delimited_literals(&bodies, &[b'/']).unwrap();
        let found: Vec<Match> = regex_set.find_iter(haystack).collect();
        assert_eq!(
            found,
            vec![Match::must(0, 0..5), Match::must(0, 6..9), Match::must(1, 9..12)]
        );
    }
}