1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820
/*!
# The core module of delimited literals
[`crate::delimited`] defines a type [`XRegex`] that represents a
choice between a [`Regex`] struct and a [`ReSequence`] struct. Its associated
functions `try_from` and `from_str` call the `parse` and `compile` functions
in turn; together they convert any delimited regular expression literal
into the compiled struct via an intermediate `BTreeMap<u32,Meta>`.
*/
#![allow(unused)]
pub use regex_automata::{meta::{Regex,BuildError},Span,Match,PatternID,Input,Anchored};
pub use crate::error::Error;
use crate::util;
use core::cell::Cell;// the wrapper structure as a mutable pointer to bytes vector in thread local static
use alloc::collections::btree_map::BTreeMap;// the data structure for meta_map
use core::convert::TryFrom; //TryFrom trait used by 1 u32 ; 2.XRegex
use alloc::str;
use alloc::str::FromStr; //FromStr trait used by XRegex
use core::cmp::PartialEq;
use core::result::Result;
use alloc::string::String;
use core::slice::{Iter,IterMut};
use alloc::vec::{Vec,IntoIter};
//use thread_local macro reference: https://doc.rust-lang.org/std/macro.thread_local.html
thread_local! {
/// Per-thread regex literal delimiter: 1 to 4 single-byte punctuation characters
/// packed into one `u32` value. The initial value is the single byte `/`.
pub static DELIMITER:Cell<u32> = const {Cell::new(b'/' as u32)};
}
/// The bracket pairs that enclose regex sets: index 0 (`[` and `]`) delimits a
/// regex union, index 1 (`<` and `>`) delimits a regex sequence.
pub static RE_SET_DELIMITERS:[[u8;2];2] = [[b'[',b']'],[b'<',b'>']];
/// The byte (`,`) that separates the items of a regex set (union or sequence).
pub static RE_SEPARATOR:u8 = b',';
/// DELIMITER_CHARS is the collection of punctuation characters from which
/// regular expression delimiters may be composed. It consists of the
/// following 20 characters: !#$%&*+,./:;=?@^_|~-
pub static DELIMITER_CHARS: [u8;20] = [0x21u8,0x23u8,0x24u8,0x25u8,0x26u8,0x2au8,0x2bu8,0x2cu8,0x2eu8,0x2fu8,0x3au8,0x3bu8,0x3du8,0x3fu8,0x40u8,0x5eu8,0x5fu8,0x7cu8,0x7eu8,0x2du8];
// A broader range of punctuation selection in forming RE delimiters: any non-alphanumeric,
// non-backslash, non-whitespace character.
// [PCRE Delimiter Reference](https://pcre.org/original/doc/html/pcretest.html)
// Regex::new(r"^[^<>[\]\w\\\\s]+$").unwrap();
// Note: in a character class (square brackets) any character except ^, -, ] or \ is a literal.
/**
Customises the delimiter used by regex literals on the current thread.

The passed delimiter (a byte slice) must consist of 1 to 4 single-byte
characters. It is first validated with [`validate_delimiter`] against
[`DELIMITER_CHARS`]. A valid delimiter is converted into a `u32`, stored in the
thread-local static [`DELIMITER`], and the function returns `true`. An invalid
or empty delimiter leaves [`DELIMITER`] untouched and the function returns
`false`; the function never panics on caller input.

A regular expression literal (reliteral) is enclosed by delimiters.
A pair of forward slashes -- "/pattern_text/", originally from the
matching operator in Perl[^1], is used as the default delimiter in a reliteral,
which makes it distinct from the other [Rust literal expressions](https://doc.rust-lang.org/reference/expressions/literal-expr.html).
Any delimiter sequence inside the pattern text of a reliteral is prepended
with a backslash. To avoid writing too many escaping backslashes, the reliteral
delimiter can be customised.

[^1]: <https://perldoc.perl.org/perlre>
*/
pub fn set_delimiter(delimiter:&[u8]) -> bool {
    // An empty slice can never be a delimiter; reject it before validation so
    // that this function cannot panic on an out-of-bounds access.
    if delimiter.is_empty() || !validate_delimiter(delimiter) {
        return false;
    }
    match bytes_to_u32(delimiter) {
        Some(code) => {
            // Update DELIMITER with the u32 value calculated from the delimiter's bytes.
            DELIMITER.with(|cell_delimiter| cell_delimiter.set(code));
            true
        },
        // Not reachable for a validated delimiter (at most 4 bytes); handled
        // explicitly so that no `unwrap` is needed.
        None => false,
    }
}
/// Returns the current thread's regex literal delimiter as bytes.
///
/// The `u32` value held in the thread-local [`DELIMITER`] is decoded with
/// `u32_to_bytes`, i.e. with its leading zero bytes trimmed off.
pub fn get_delimiter() -> Vec<u8> {
    DELIMITER.with(|cell_delimiter| u32_to_bytes(cell_delimiter.get()))
}
/**
Checks a delimiter against [`DELIMITER_CHARS`].

A byte sequence is a valid delimiter for enclosing pattern text when it consists
of either a single candidate punctuation character or of the same candidate
punctuation character repeated (up to 4 bytes in total). An empty slice, a slice
longer than 4 bytes, a slice starting with a byte outside [`DELIMITER_CHARS`] and
a slice mixing different bytes are all rejected with `false`.

Note that the bracket-style delimiters accepted by many PCRE (Perl Compatible
Regular Expressions[^2]) engines are excluded from DELIMITER_CHARS, as they are
reserved for delimiting elements in regex sets.

Note:
* There are two styles of delimiters in PCRE: matched delimiters and
bracket-style delimiters. This Rust crate regex-literal only uses matched delimiters: single
or multiple repeated punctuation characters (excluding the quote characters " ' `) for
enclosing regex literals. Bracket-style punctuation ([], <>, (), {}, etc.) is reserved
for regex set literals.

[^1]: <https://perldoc.perl.org/perlre>
[^2]: <https://pcre.org/original/doc/html/pcretest.html>
*/
pub fn validate_delimiter(delimiter:&[u8]) -> bool {
    // Limit the delimiter size to 4 bytes so that it fits into a u32.
    if delimiter.len() > 4 {
        return false;
    }
    match delimiter.split_first() {
        // An empty delimiter cannot enclose anything.
        None => false,
        // The first byte must be a candidate punctuation character and every
        // following byte must repeat it.
        Some((first, rest)) => {
            DELIMITER_CHARS.contains(first) && rest.iter().all(|byte| byte == first)
        },
    }
}
/// Packs a byte slice of at most 4 bytes into a `u32`, treating the slice as a
/// big-endian number (the last byte ends up in the lowest 8 bits).
///
/// Returns `None` when the slice is longer than 4 bytes; an empty slice yields `Some(0)`.
fn bytes_to_u32(bytes:&[u8]) -> Option<u32> {
    if bytes.len() > 4 {
        return None;
    }
    // Shifting the accumulator left by one byte per step places every byte at
    // the same position as an explicit `(len - i - 1) * 8` shift would.
    let packed = bytes
        .iter()
        .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte));
    Some(packed)
}
/// Converts a `u32` value into its big-endian bytes with the leading zero bytes
/// trimmed off. The value 0 is returned as a single zero byte.
fn u32_to_bytes(c:u32) -> Vec<u8> {
    let be_bytes = c.to_be_bytes();
    // Everything from the first non-zero byte onwards is kept, including any
    // zero bytes that follow it.
    match be_bytes.iter().position(|&byte| byte != 0) {
        Some(first_significant) => be_bytes[first_significant..].to_vec(),
        None => vec![0],
    }
}
/// ReSequence is the sequence of regex_automata::Regex (can be either
/// single-pattern or multiple-pattern) that can be utilized in a timeline
/// /series of matching events.
#[derive(Debug,Clone)] //To debug a struct in Rust, you can use the Debug trait. The Debug trait provides a way to format the output of a struct in a programmer-facing, debugging context
pub struct ReSequence(Vec<Regex>);
impl ReSequence {
/// Construct a new, empty `ReSequence`
/// The Regex vector as resequence's field 0 will not be allocated in initialization
#[inline]
pub const fn new() -> Self { // need transfrom like this https://docs.rs/regex/latest/src/regex/regexset/string.rs.html? No
ReSequence(Vec::new())
}
/// Appends an element to the back of a collection.
#[inline]
pub fn push(&mut self, elem:Regex) {
self.0.push(elem);
}
/// return the number of elements in ReSequence struct.
#[inline]
pub fn len(&self) -> usize {
self.0.len()
}
/// tell if the ReSequence struct is empty or not
#[inline]
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
// https://users.rust-lang.org/t/newtype-pattern-for-vec-how-to-implement-iter/52653/2
/// Returns an iterator over the slice.
///
/// The iterator yields all items from start to end
#[inline]
pub fn iter(&self) -> Iter<'_, Regex> {
self.0.iter()
}
/// Returns an iterator that allows modifying each value.
/// The iterator yields all items from start to end.
#[inline]
pub fn iter_mut(&mut self) -> alloc::slice::IterMut<'_, Regex> {
self.0.iter_mut()
}
/// Removes the last element from a vector and returns it, or None if it is empty.
#[inline]
pub fn pop(&mut self) -> Option<Regex> {
self.0.pop()
}
/// Resequence is used as slice.
#[inline]
pub fn as_slice(&self) -> &[Regex] {
self.0.as_slice()
}
}
//Implementing Iterator https://doc.rust-lang.org/std/iter/index.html
//ref example https://stackoverflow.com/questions/34733811/what-is-the-difference-between-iter-and-into-iter
//ref example https://stackoverflow.com/questions/30218886/how-to-implement-iterator-and-intoiterator-for-a-simple-struct
/// The method that converts ReSequence into an Iterator, which works with `for` syntax.
#[cfg(not(no_global_oom_handling))]
impl IntoIterator for ReSequence {
type Item = Regex;
type IntoIter = IntoIter<Self::Item>;
#[inline]
fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
}
}
/// Create a value from an iterator.
//reference https://doc.rust-lang.org/std/iter/trait.FromIterator.html
#[cfg(not(no_global_oom_handling))]
impl FromIterator<Regex> for ReSequence {
#[inline]
fn from_iter<I: IntoIterator<Item = Regex>>(iter: I) -> Self {
let mut c = ReSequence::new();
for i in iter {
c.push(i);
}
c
}
}
/// Identifiers for the kinds of regex literal.
///
/// The enum carries no data, so it is cheap to copy and can be compared directly.
#[derive(Debug,Clone,Copy,PartialEq,Eq)]
pub enum LiteralForm{
    /// literal representation of [`crate::ReSequence`]
    ReS,
    /// literal representation of [`Regex`] that holds multiple patterns
    ReU,
    /// literal representation of [`Regex`] that holds one pattern
    Re,
}
/// A collection of regular expression data artifacts: the compiled regex data
/// together with the literal it was built from and the kind of that literal.
#[derive(Debug, Clone)]
pub struct XRegex {
// Item 0 is the delimiter (re_puncts) transcoded into a u32 value; item 1 is the
// literal string (whitespace trimmed during parsing).
pub literal:(u32,String),
// The compiled regexes produced from the literal.
pub data:ReSequence,
// The kind of literal (Re, ReU or ReS) that `data` was compiled from.
pub kind:LiteralForm,
}
// Reference: https://doc.rust-lang.org/std/cmp/trait.PartialEq.html
/// Two `XRegex` values are equal when their `literal` fields (delimiter code and
/// literal text) are equal; the compiled data and the kind are not compared.
impl PartialEq for XRegex {
    fn eq(&self, other: &Self) -> bool {
        // Tuple equality compares the delimiter code first and the text second.
        self.literal == other.literal
    }
}
//TODO: XRegex use some design pattern in future? https://rust-unofficial.github.io/patterns/patterns/behavioural/strategy.html
/// Converts reliteral bytes into an `XRegex`, using the delimiter currently
/// stored in the thread-local [`DELIMITER`].
impl TryFrom<&[u8]> for XRegex {
    type Error = Error;//crate::error::Error

    fn try_from(value: &[u8]) -> Result<Self,Self::Error> {
        // Decode the delimiter first, then run the two conversion stages.
        let re_puncts:Vec<u8> = DELIMITER.with(|cell_delimiter| u32_to_bytes(cell_delimiter.get()));
        let metadata = parse(value,&re_puncts[..])?;
        compile(value,metadata,&re_puncts[..])
    }
}
//TODO: other construction option:read file as binary into utf-8 string: use std::fs; String::from_utf8_unchecked(&fs::read("address.txt")?)
//https://doc.rust-lang.org/std/str/trait.FromStr.html
/// Converts a reliteral string (`&str`) into an `XRegex` by delegating to the
/// `TryFrom<&[u8]>` implementation with the string's UTF-8 bytes.
impl FromStr for XRegex {
    type Err = Error;

    fn from_str(value:&str) -> Result<Self,Self::Err> {
        let reliteral:&[u8] = value.as_bytes();
        <Self as TryFrom<&[u8]>>::try_from(reliteral)
    }
}
impl XRegex {
/// XRegex constructor creates XRegex struct with a customised re_puncts
pub fn new(re_text:&str,re_puncts:&[u8]) -> Result<Self,Error> {
if !validate_delimiter(re_puncts) {
let mut msg = "Invalid delimiter:".to_owned();
let msg_body = match str::from_utf8(re_puncts) {
Err(_) => "non-utf8-code",
Ok(puncts_str) => puncts_str,
};
msg.push_str(msg_body);
Err(Error::Syntax(msg))
}else {
let reliteral = re_text.as_bytes();
let metadata = parse(reliteral,re_puncts)?;
compile(reliteral,metadata,re_puncts)
}
}
/// check if XRegex is Resequence or not
pub fn is_seq(&self) -> bool {
matches!(self.kind,LiteralForm::ReS)
}
/// get regex from XRegex struct. None is returned if it is not item kind.
pub fn get_regex(&mut self) -> Option<Regex> { //https://stackoverflow.com/questions/29662807/how-do-you-borrow-a-mutable-reference-during-a-match
if self.is_seq() {None}
else {self.data.pop()}
}
/// get regex sequence from XRegex struct. None is returned if it is not seq kind.
pub fn as_slice(&self) -> Option<&[Regex]> {
if self.is_seq() {
Some(self.data.as_slice())
}else {None}
}
/// tell if its data is empty
pub fn is_empty(&self) -> bool {
self.data.is_empty()
}
}
//may develop a WASM version of `Meta`. memory allocation for wasm stack memory: 4G
//WebAssembly linear memory objects have sizes measured in pages. Each page is 65536 (2^16) bytes. In WebAssembly version 1, a linear memory can have at most 65536 pages, for a total of 2^32 bytes (4 gibibytes).
//https://stackoverflow.com/questions/40417774/memory-limits-in-webassembly#:~:text=1%20current%20WebAssembly%20implementations%20follow%20a%2032bit%20addressing,pages%20as%20something%20more%20%22safe%22%20than%20desktop%20applications.
/// Intermediate data produced by the parse stage and consumed by the compile
/// stage when a reliteral is converted into an XRegex.
struct Meta {
    /// the literal kind, in accordance with the kinds used by XRegex
    kind:LiteralForm,
    /// the content range, with the opening and closing delimiters excluded
    range:(u32,u32),
    /// an optional list containing the indices of the child metas
    children:Option<Vec<u32>>,
}

impl Meta {
    /// Appends a child meta index to the children list, creating the list on
    /// first use.
    fn add_child(&mut self,child_index:u32) {
        self.children.get_or_insert_with(Vec::new).push(child_index);
    }

    /// Sets the closing (right) bound of the meta's range.
    fn finalise(&mut self, right_range:u32){
        let (left_range, _) = self.range;
        self.range = (left_range, right_range);
    }
}
/// Constructs the meta of a single regex item ([`LiteralForm::Re`]) whose content
/// range is `start..end`; the meta has no children.
fn create_meta_re(start:usize,end:usize) -> Meta {
    let range = (start as u32,end as u32);
    Meta{kind:LiteralForm::Re,range,children:None}
}
/// analyse reliteral (the byte form of regex literal) with the preset
/// re_puncts (the byte form of regex delimiter), a tuple of root meta
/// index in reliteral and meta_map is returned when the execution is successful.
/// The max reliteral length is set to 32 bits, which makes the produced keys
/// in meta_map (BTreeMap) is confined to u32.
fn parse(reliteral: &[u8],re_puncts: &[u8]) -> Result <(u32,BTreeMap<u32,Meta>),Error>{
//metaMap stores all the meta data of literal forms and their indices
let mut meta_map:BTreeMap<u32,Meta> = BTreeMap::new();
let target_size = reliteral.len();
let reliteral_bytes_fitting_in_u32 = u32::try_from(target_size);
if let Err(err) = reliteral_bytes_fitting_in_u32 {
return Err(Error::Syntax(format!("Invalid reliteral as its size exceeds the limit of 2^32 bytes: {err}")));
}
let re_delimiter_length = re_puncts.len();
//walk through all bytes of reliteral
let mut index:usize = 0;
let start = util::offset_ws(reliteral,index);
index = start;
let walk_over = proceed(&mut index,reliteral,re_puncts,&mut meta_map);
if walk_over {
let end = util::offset_ws(reliteral,index);
if end != target_size {
return Err(Error::Syntax(format!("Invalid reliteral - an unparsed tail from byte index {end}.")));
}
if meta_map.is_empty() {
return Err(Error::Syntax("Invalid reliteral - no meta data has been parsed.".to_owned()))
}
Ok((start as u32,meta_map))
} else {
Err(Error::Syntax("Unrecognized reliteral format!".to_owned()))
}
}
/// proceed reliteral while iterating its byte index i with the provided re_puncts for producing meta_map
fn proceed(i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
store_re(None,i,reliteral,re_puncts,meta_map) ||
store_reu(None,i,reliteral,re_puncts,meta_map) ||
store_res(None,i,reliteral,re_puncts,meta_map)
}
/// Parses a single delimited regex item starting at byte index `*i` and stores
/// its meta into meta_map under that index.
///
/// `pc_index` is the index of the parent closure (a union or a sequence), if
/// any; the new item is registered as a child of that parent. Returns `false`
/// when no delimited regex starts at `*i` or when the parent cannot be located
/// in meta_map; on success `*i` is moved just behind the closing delimiter.
fn store_re(pc_index:Option<u32>,i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
    let this_index = *i as u32;
    let indices = match find_re_range(*i,reliteral,re_puncts) {
        Some(found) => found,
        None => return false,
    };
    // Create and insert the regex item.
    meta_map.insert(this_index,create_meta_re(indices[0],indices[1]));
    // Add the index to its parent meta when a parent is given.
    if let Some(pos) = pc_index {
        match meta_map.get_mut(&pos) {
            Some(parent) => parent.add_child(this_index),
            // The parent item can't be located by its index in meta_map.
            None => return false,
        }
    }
    *i = indices[1] + re_puncts.len();
    true
}
fn store_reu(pc_index:Option<u32>,i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
let start = *i;
let this_index = start as u32;
if (reliteral[start] == RE_SET_DELIMITERS[0][0]){ //matching ReUnion delimiter
*i += 1;
let mut this_meta = Meta{kind:LiteralForm::ReU,range:(*i as u32,*i as u32),children:None};
meta_map.insert(this_index,this_meta);
*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters
if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map) {
return false;
}
//make sure getting first item
let mut proceeding = true;
while proceeding { //iteratively collecting RE SEPARATOR and item
*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters after the re item
if reliteral[*i] == RE_SEPARATOR { //matching the SEPARATOR character
*i += 1; //stepping RE_SEPARATOR
*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters after RE_SPERATOR
if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map) {
return false;
}//the function returns a false abnormality when no item follows RE_SEPARATOR
}else {proceeding = false;}
}
if reliteral[*i] == RE_SET_DELIMITERS[0][1] { //matching the closing delimiter of ReU
//assign i to the right range
if let Some(this_meta) = meta_map.get_mut(&this_index){
this_meta.finalise(*i as u32);
}
if let Some(pos) = pc_index {
if let Some(meta) = meta_map.get_mut(&pos) {
meta.add_child(this_index);
} else {
return false;//unable to locate the parent item by index in meta_map
}
}
*i += 1; //stepping over the closing RE_SET_DELIMITER
return true;
}
} //end of if matching ReUnion delimiter
false
}
fn store_res(pc_index:Option<u32>,i:&mut usize,reliteral: &[u8],re_puncts: &[u8],meta_map:&mut BTreeMap<u32,Meta>) -> bool {
let start = *i;
let this_index = start as u32;
if (reliteral[start] == RE_SET_DELIMITERS[1][0]){ //matching ReSequence delimiter
*i += 1;
let this_meta = Meta{kind:LiteralForm::ReS,range:(*i as u32,*i as u32),children:None};
meta_map.insert(this_index,this_meta);
*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters
if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map)
&& !store_reu(Some(this_index),i,reliteral,re_puncts,meta_map) {
return false;
}
//make sure getting first item
let mut proceeding = true;
while proceeding { //iteratively collecting RE SEPARATOR and item
*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters after the re item
if reliteral[*i] == RE_SEPARATOR { //matching the SEPARATOR character
*i += 1; //stepping RE_SEPARATOR
*i = util::offset_ws(reliteral,*i);//stepping over whitespace characters after RE_SPERATOR
if !store_re(Some(this_index),i,reliteral,re_puncts,meta_map) &&
!store_reu(Some(this_index),i,reliteral,re_puncts,meta_map) {
return false;
}
}else {proceeding = false;}
}
if reliteral[*i] == RE_SET_DELIMITERS[1][1] { //matching the closing delimiter of ReS
//assign i to the right range
if let Some(this_meta) = meta_map.get_mut(&this_index){
this_meta.finalise(*i as u32);
}
if let Some(pos) = pc_index {
if let Some(meta) = meta_map.get_mut(&pos) {
meta.add_child(this_index);
} else {
return false;//unable to locate the parent item by index in meta_map
}
}
*i += 1; //stepping over the closing RE_SET_DELIMITER
return true;
}
} //end of if matching ReSequence delimiter
false
} //end of fn store_res
/// Construct regex from meta_ref, save it into pool, and return literal back
fn compile_re(source:&[u8],pool:&mut ReSequence,meta_ref:&Meta,re_puncts:&[u8]) -> Result<String,Error> {
let start = meta_ref.range.0 as usize;
let stop = meta_ref.range.1 as usize;
let regex = regex_from_delimited_literal(&source[start..stop],re_puncts)?;
pool.push(regex);//store to pool data
let re_puncts_length = re_puncts.len();
let full_start = start-re_puncts_length;
let full_stop = stop+re_puncts_length;
let postback_bytes = &source[full_start..full_stop];
match str::from_utf8(postback_bytes) {
Ok(postback) => Ok(postback.to_owned()) ,
Err(error) => Err(Error::from_utf8_error(error,full_start)) ,
}
}
/// construct regexset from meta_ref and meta_map, save it into pool, and return literal
fn compile_reu(source:&[u8],pool:&mut ReSequence, meta_ref:&Meta,meta_map:&BTreeMap<u32,Meta>,re_puncts:&[u8])
-> Result<String,Error> {
let range:[u32;2] = [meta_ref.range.0,meta_ref.range.1];
if let Some(ref children_indices) = meta_ref.children {
//an alternative way is to use children_indices.iter().map (|x| ...)
let mut re_union:Vec<&[u8]> = Vec::new(); //store regex patterns (without delimiters) into regex union
//refactor re_bytes to re_literals
let mut re_literals:Vec<&str> = Vec::new(); //store regex literals (including regex delimiters) into regex literal
let puncts_length = re_puncts.len();
//an array of u8 vector is needed to be constructed beforehand
//https://stackoverflow.com/questions/70510299/how-to-declare-a-static-array-of-vectors#:~:text=If%20you%20know%20the%20size%20of%20the%20%22vec%22,This%20lets%20you%20write%20something%20like%20the%20following%3A
for child_start_index in children_indices.iter() {
if let Some(child_meta_ref) = meta_map.get(child_start_index){
let start = child_meta_ref.range.0 as usize;
let stop = child_meta_ref.range.1 as usize;
let full_start = start - puncts_length;
let full_stop = stop + puncts_length;
let re_item = &source[start..stop];
re_union.push(re_item);
match str::from_utf8(&source[full_start..full_stop]) { //get literal
Ok(re_literal) => re_literals.push(re_literal),
Err(err) => return Err(Error::from_utf8_error(err,full_start)),
}
} else { //the case that child meta is not found by the child_start_index
return Err(Error::Syntax(
format!("The literal of ReU (RegexUnion) ranging {range:?} does not have valid Regex item at byte index {child_start_index}.")
));
}
}
if re_union.is_empty() {
return Err(Error::Syntax(
format!("The literal for ReU (RegexUnion) ranging {range:?} contains 0 regex item.")
));
}
let regexset = regexset_from_delimited_literals(&re_union[..],re_puncts)?;
pool.push(regexset);
let mut postback_string = String::from("");
postback_string.push(char::from_u32(RE_SET_DELIMITERS[0][0] as u32).unwrap());
postback_string.push_str(&re_literals.join(str::from_utf8(&[RE_SEPARATOR]).unwrap()));
postback_string.push(char::from_u32(RE_SET_DELIMITERS[0][1] as u32).unwrap());
Ok(postback_string)
} else { //when meta_ref.children is None
Err(Error::Syntax(
format!("The literal for ReU (Regex Union) ranging {range:?} does not have valid Regex item.")
))
}
}
/// construct a vector of regex data from meta_ref and meta_map, save it to pool and post back literal
fn compile_res(source:&[u8],pool:&mut ReSequence, meta_ref:&Meta,meta_map:&BTreeMap<u32,Meta>,re_puncts:&[u8]) ->
Result<String,Error> {
let range:[u32;2] = [meta_ref.range.0,meta_ref.range.1];
if let Some(ref children_indices) = meta_ref.children {
let sequence_length = children_indices.len();
let mut literal_seq:Vec<String> = Vec::with_capacity(sequence_length);
for child_start_index in children_indices.iter() {
if let Some(child_meta_ref) = meta_map.get(child_start_index){
match child_meta_ref.kind {
LiteralForm::Re => {
let re = compile_re(source,pool,child_meta_ref,re_puncts)?;
literal_seq.push(re);
},
LiteralForm::ReU => {
let reu = compile_reu(source,pool,child_meta_ref,meta_map,re_puncts)?;
literal_seq.push(reu);
},
_ => {
return Err(Error::Syntax(
format!("The literal of ReS (ReSequence) ranging {range:?} has encountered an unhandled meta kind at index {child_start_index}.")
)
);
},
}
} else { //child meta is not found by child_start_index
return Err(Error::Syntax(
format!("Within ReS (ReSequence) ranging {range:?}, the Regex item cannot be located by its byte index {child_start_index}.")
));
}
}
if literal_seq.is_empty() {
return Err(Error::Syntax(
format!("The literal for ReS (ReSequence) ranging {range:?} contains 0 Regex item.")
));
}
let mut postback_string = String::from("");
postback_string.push(char::from_u32(RE_SET_DELIMITERS[1][0] as u32).unwrap());
let joined = &literal_seq[..].join(str::from_utf8(&[RE_SEPARATOR]).unwrap()); //to be tested
postback_string.push_str(&literal_seq.join(&joined[..]));
postback_string.push(char::from_u32(RE_SET_DELIMITERS[1][1] as u32).unwrap());
Ok(postback_string)
} else {
Err(Error::Syntax(
format!("The literal for ReS (Regex Sequence) positioned ranging {range:?} has zero Regex item.")
))
}
}
/// the method constructs XRegex data given reliteral source and the parsed metadata
fn compile(source:&[u8],parsed:(u32,BTreeMap<u32,Meta>),re_puncts:&[u8]) -> Result<XRegex, Error> {
let re_delimiter_length = re_puncts.len();
let index = parsed.0;
let meta_map = &(parsed.1);
if let Some(meta_ref) = meta_map.get(&index) {
let mut pool:ReSequence = ReSequence::new();
let mut pool_ref = &mut pool;
let delimiter = bytes_to_u32(re_puncts).ok_or(Error::Syntax("Failed in delimiter transcoding.".to_owned()))?;//double check
match meta_ref.kind {
LiteralForm::Re => {
let re = compile_re(source,pool_ref,meta_ref,re_puncts)?;
Ok(XRegex{data:pool,literal:(delimiter,re),kind:LiteralForm::Re})
},
LiteralForm::ReU => {
let reu = compile_reu(source,pool_ref,meta_ref,meta_map,re_puncts)?;
Ok(XRegex{data:pool,literal:(delimiter,reu),kind:LiteralForm::ReU})
},
LiteralForm::ReS => {
let res = compile_res(source,pool_ref,meta_ref,meta_map,re_puncts)?;
Ok(XRegex{data:pool,literal:(delimiter,res),kind:LiteralForm::ReS})
},
}
}
else {
Err(Error::Syntax(format!("No meta data indexed at {index} in meta_map.")))
}
}
/// Construct regex from a delimited literal. @todo ,refactor it for storing original escaped characters
fn regex_from_delimited_literal(rebody:&[u8],delimiter:&[u8]) ->
Result<Regex,Error> {
let unescaped = match util::unescape_from_bytes(rebody,delimiter){
Ok(text) => text,
Err(err_info) => return Err(Error::Syntax(err_info)),
};
Regex::new(&unescaped[..]).map_err(Error::from_meta_build_error)
}
/// Construct regex from an array of delimited literals.
fn regexset_from_delimited_literals(rebodies:&[&[u8]],delimiter:&[u8]) ->
Result<Regex,Error> {
let mut vec = Vec::new();//for storing reliterals (String type) representing Re
for bytes_ref in rebodies.iter() {
let unescaped = match util::unescape_from_bytes(bytes_ref,delimiter){
Ok(text) => text,
Err(err_info) => return Err(Error::Syntax(err_info)),
};
vec.push(unescaped.into_owned());
}
//get the references from iterator, following example:https://doc.rust-lang.org/std/vec/struct.Vec.html#method.iter
let mut ref_vec = Vec::new();
let vec_refs = &vec;
for bytes_ref in vec_refs.iter(){ //iter() iterates over &String (= &str)
ref_vec.push(bytes_ref);
}
Regex::new_many(&ref_vec).map_err(Error::from_meta_build_error)
}
/// Given the starting index i in the reliteral bytes, finds the content range
/// that lies between a pair of delimiters (re_puncts, in UTF-8 bytes), where the
/// opening delimiter must start exactly at index i.
///
/// The boundary indices `[content_start, content_end]` (delimiters excluded) are
/// returned if found; otherwise `None` is returned. A delimiter preceded by an
/// unescaped backslash does not close the literal.
fn find_re_range(i:usize,reliteral:&[u8],re_puncts:&[u8]) -> Option<[usize;2]> {
    let delimiter_len = re_puncts.len();
    let total_len = reliteral.len();
    // There must be (1) room for some content besides the pair of delimiters and
    // (2) the opening delimiter right at index i.
    let has_room = total_len > i + 2 * delimiter_len;
    if !has_room || &reliteral[i..i + delimiter_len] != re_puncts {
        return None;
    }
    // Without a delimiter there is nothing to search for.
    if delimiter_len == 0 {
        return None;
    }
    let content_start = i + delimiter_len;
    let mut cursor = content_start;
    let mut escaped = false;
    loop {
        if cursor + delimiter_len > total_len {
            return None;//the closing delimiter is not found till the end of bytes
        }
        let code = reliteral[cursor];
        let char_length = util::infer_char_size(code);
        match char_length {
            0 => {
                println!("invalid UTF code is found at index {}",cursor);
                return None;
            },
            1 => {
                if code == b'\\' {
                    // A backslash escapes the next character unless it is escaped itself.
                    escaped = !escaped;
                } else if escaped {
                    escaped = false;
                } else if &reliteral[cursor..cursor + delimiter_len] == re_puncts {
                    // The candidate characters used by delimiters are 1-byte characters only.
                    return Some([content_start,cursor]);
                }
            },
            _ => {
                // A multi-byte character consumes a pending escape.
                escaped = false;
            },
        }
        cursor += char_length as usize;
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Escaped delimiters inside the body must not terminate the literal.
    #[test]
    fn test_find_re_range(){
        let re_bytes = b"/(?i)\\/ab+c\\//";
        let range = find_re_range(0,&re_bytes[..],b"/");
        assert_eq!(range,Some([1,13]));
    }

    /// The delimiter `/` is present in escaped form in the literal body.
    #[test]
    fn test_regex_from_delimited_literal(){
        let body = br"(?i)ab+c\/";
        let re0 = regex_from_delimited_literal(&body[..],b"/").unwrap();
        assert!(re0.is_match("ABBBC/"));
    }

    /// Two literal bodies are compiled into one multi-pattern regex.
    #[test]
    fn test_regexset_from_delimited_literals(){
        let my_text = "ABBBC abc123";
        let item0 = br"(?i)ab+c";
        let item1 = br"\d+";
        // `&item[..]` turns each byte array into the `&[u8]` slice that the function expects.
        let reunion = [&item0[..],&item1[..]];
        let my_set = regexset_from_delimited_literals(&reunion,b"/").unwrap();
        let matches:Vec<Match> = my_set.find_iter(my_text).collect();
        let expected = vec![Match::must(0,0..5),Match::must(0,6..9),Match::must(1,9..12)];
        assert_eq!(matches,expected);
    }
}