1#[cfg(feature = "no_std")]
5use alloc::collections::BTreeMap as HashMap;
6
7#[cfg(not(feature = "no_std"))]
8use std::collections::HashMap;
9
10use alloc::{borrow::Cow, vec, vec::Vec};
11use core::fmt;
12
13use crate::{ctype, tree::*, PosixRegex};
14
15#[derive(Clone, Copy, PartialEq, Eq)]
17pub struct Range(pub u32, pub Option<u32>);
18impl fmt::Debug for Range {
19 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
20 match self {
21 Range(start, None) => write!(f, "{}..", start),
22 Range(start, Some(end)) => write!(f, "{}..{}", start, end),
23 }
24 }
25}
26
27#[derive(Clone, PartialEq, Eq)]
29pub enum Collation {
30 Char(u8),
31 Class(fn(u8) -> bool),
32}
33impl Collation {
34 pub fn matches(&self, other: u8, insensitive: bool) -> bool {
36 match *self {
37 Collation::Char(me) if insensitive => {
38 if ctype::is_alpha(me) && ctype::is_alpha(other) {
39 me | 32 == other | 32
40 } else {
41 me == other
42 }
43 }
44 Collation::Char(me) => me == other,
45 Collation::Class(f) => f(other),
46 }
47 }
48}
49impl fmt::Debug for Collation {
50 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
51 match *self {
52 Collation::Char(c) => write!(f, "{:?}", c as char),
53 Collation::Class(c) => write!(f, "{:p}", c),
54 }
55 }
56}
57
58#[derive(Clone, PartialEq, Eq)]
60pub enum Token {
61 InternalStart,
63
64 Alternative,
65 Any,
66 BackRef(u32),
67 Char(u8),
68 End,
69 Group(usize),
70 OneOf {
71 invert: bool,
72 list: Vec<Collation>,
73 },
74 Root,
75 Start,
76 WordEnd,
77 WordStart,
78}
79impl fmt::Debug for Token {
80 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
81 match *self {
82 Token::InternalStart => write!(f, "<START>"),
83
84 Token::Alternative => write!(f, "Alternative"),
85 Token::Any => write!(f, "."),
86 Token::BackRef(id) => write!(f, "\\{}", id),
87 Token::Char(c) => write!(f, "{:?}", c as char),
88 Token::End => write!(f, "$"),
89 Token::Group(id) => write!(f, "Group({})", id),
90 Token::OneOf { invert, ref list } => write!(f, "{{invert: {}, {:?}}}", invert, list),
91 Token::Root => write!(f, "Root"),
92 Token::Start => write!(f, "^"),
93 Token::WordEnd => write!(f, ">"),
94 Token::WordStart => write!(f, "<"),
95 }
96 }
97}
98#[derive(Clone, Debug, PartialEq, Eq)]
100pub enum Error {
101 EOF,
102 EmptyRepetition,
103 Expected(u8, Option<u8>),
104 IllegalRange,
105 IntegerOverflow,
106 InvalidBackRef(u32),
107 LeadingRepetition,
108 UnclosedRepetition,
109 UnexpectedToken(u8),
110 UnknownClass(Vec<u8>),
111 UnknownCollation,
112}
113
114pub struct PosixRegexBuilder<'a> {
116 input: &'a [u8],
117 classes: HashMap<&'a [u8], fn(u8) -> bool>,
118 group_id: usize,
119 builder: TreeBuilder,
120 extended: bool,
121}
122impl<'a> PosixRegexBuilder<'a> {
123 pub fn new(input: &'a [u8]) -> Self {
125 Self {
126 input,
127 classes: HashMap::new(),
128 group_id: 1,
129 builder: TreeBuilder::default(),
130 extended: false,
131 }
132 }
133 pub fn with_class(mut self, name: &'a [u8], callback: fn(u8) -> bool) -> Self {
135 self.classes.insert(name, callback);
136 self
137 }
138 pub fn with_default_classes(mut self) -> Self {
140 #[cfg(not(feature = "no_std"))]
141 self.classes.reserve(12);
142
143 self.classes.insert(b"alnum", ctype::is_alnum);
144 self.classes.insert(b"alpha", ctype::is_alpha);
145 self.classes.insert(b"blank", ctype::is_blank);
146 self.classes.insert(b"cntrl", ctype::is_cntrl);
147 self.classes.insert(b"digit", ctype::is_digit);
148 self.classes.insert(b"graph", ctype::is_graph);
149 self.classes.insert(b"lower", ctype::is_lower);
150 self.classes.insert(b"print", ctype::is_print);
151 self.classes.insert(b"punct", ctype::is_punct);
152 self.classes.insert(b"space", ctype::is_space);
153 self.classes.insert(b"upper", ctype::is_upper);
154 self.classes.insert(b"xdigit", ctype::is_xdigit);
155
156 self
157 }
158 pub fn extended(mut self, extended: bool) -> Self {
160 self.extended = extended;
161 self
162 }
163 pub fn compile(self) -> Result<PosixRegex<'static>, Error> {
165 let tree = self.compile_tokens()?;
166 Ok(PosixRegex::new(Cow::Owned(tree)))
167 }
168 pub fn compile_tokens(mut self) -> Result<Tree, Error> {
169 self.builder.start_internal(Token::Root, Range(1, Some(1)));
170 self.parse()?;
171 self.builder.finish_internal();
172 Ok(self.builder.finish())
173 }
174
175 fn consume(&mut self, amount: usize) {
176 self.input = &self.input[amount..];
177 }
178 fn take_int(&mut self) -> Result<Option<u32>, Error> {
179 let mut out: Option<u32> = None;
180 while let Some(&c @ b'0'..=b'9') = self.input.first() {
181 self.consume(1);
182 out = Some(
183 out.unwrap_or(0)
184 .checked_mul(10)
185 .and_then(|out| out.checked_add((c - b'0') as u32))
186 .ok_or(Error::IntegerOverflow)?,
187 );
188 }
189 Ok(out)
190 }
191 fn next(&mut self) -> Result<u8, Error> {
192 self.input
193 .first()
194 .map(|&c| {
195 self.consume(1);
196 c
197 })
198 .ok_or(Error::EOF)
199 }
200 fn expect(&mut self, c: u8) -> Result<(), Error> {
201 if self.input.first() != Some(&c) {
202 return Err(Error::Expected(c, self.input.first().cloned()));
203 }
204 self.consume(1);
205 Ok(())
206 }
207 fn parse_range(&mut self) -> Result<Range, Error> {
208 let mut range = Range(1, Some(1));
209 if let Some(&c) = self.input.first() {
210 let new = match c {
211 b'*' => Some((1, Range(0, None))),
212 b'\\' => match self.input.get(1) {
213 Some(b'?') if !self.extended => Some((2, Range(0, Some(1)))),
214 Some(b'+') if !self.extended => Some((2, Range(1, None))),
215 Some(b'{') if !self.extended => {
216 self.consume(2);
217 let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
218 let mut second = Some(first);
219 if let Some(b',') = self.input.first() {
220 self.consume(1);
221 second = self.take_int()?;
222 }
223 if self.input.first() == Some(&b'}') {
224 self.consume(1);
225 } else if self.input.starts_with(br"\}") {
226 self.consume(2);
227 } else {
228 return Err(Error::UnclosedRepetition);
229 }
230 if second.map(|second| first > second).unwrap_or(false) {
231 return Err(Error::IllegalRange);
232 }
233 range = Range(first, second);
234 None
235 }
236 _ => None,
237 },
238 b'?' if self.extended => Some((1, Range(0, Some(1)))),
239 b'+' if self.extended => Some((1, Range(1, None))),
240 b'{' if self.extended => {
241 self.consume(1);
242 let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
243 let mut second = Some(first);
244 if let Some(b',') = self.input.first() {
245 self.consume(1);
246 second = self.take_int()?;
247 }
248 if self.input.first() == Some(&b'}') {
249 self.consume(1);
250 } else if self.input.starts_with(br"\}") {
251 self.consume(2);
252 } else {
253 return Err(Error::UnclosedRepetition);
254 }
255 if second.map(|second| first > second).unwrap_or(false) {
256 return Err(Error::IllegalRange);
257 }
258 range = Range(first, second);
259 None
260 }
261 _ => None,
262 };
263 if let Some((consume, new)) = new {
264 range = new;
265 self.consume(consume);
266 }
267 }
268 Ok(range)
269 }
270 fn parse(&mut self) -> Result<(), Error> {
271 self.builder
272 .start_internal(Token::Alternative, Range(1, Some(1)));
273 while let Ok(c) = self.next() {
274 let token = match c {
275 b'^' => Token::Start,
276 b'$' => Token::End,
277 b'.' => Token::Any,
278 b'[' => {
279 let mut list = Vec::new();
280 let invert = self.input.first() == Some(&b'^');
281
282 if invert {
283 self.consume(1);
284 }
285
286 loop {
287 let mut c = self.next()?;
288
289 let mut push = true;
290
291 if c == b'[' {
292 match self.next()? {
296 b'.' => {
297 c = self.next()?;
298 self.expect(b'.')?;
299 self.expect(b']')?;
300 }
301 b'=' => {
302 c = self.next()?;
303 self.expect(b'=')?;
304 self.expect(b']')?;
305 }
306 b':' => {
307 let end = self
308 .input
309 .iter()
310 .position(|&c| c == b':')
311 .ok_or(Error::EOF)?;
312 let key = &self.input[..end];
313 let class = *self
314 .classes
315 .get(key)
316 .ok_or_else(|| Error::UnknownClass(key.to_vec()))?;
317 self.consume(end + 1);
318 self.expect(b']')?;
319
320 list.push(Collation::Class(class));
321 push = false;
322 }
323 _ => return Err(Error::UnknownCollation),
324 }
325 }
326
327 if push {
328 list.push(Collation::Char(c));
329
330 if self.input.first() == Some(&b'-') && self.input.get(1) != Some(&b']')
331 {
332 self.consume(1);
333 let dest = self.next()?;
334 for c in (c + 1)..=dest {
335 list.push(Collation::Char(c));
336 }
337 }
338 }
339
340 if self.input.first() == Some(&b']') {
341 self.consume(1);
342 break;
343 }
344 }
345
346 Token::OneOf { invert, list }
347 }
348 b'\\'
349 if self
350 .input
351 .first()
352 .map(|&c| c.is_ascii_digit())
353 .unwrap_or(false) =>
354 {
355 let id = self.take_int()?.unwrap();
356 if (id as usize) >= self.group_id {
357 return Err(Error::InvalidBackRef(id));
358 }
359 Token::BackRef(id)
360 }
361 b'\\' => match self.next()? {
362 b'(' if !self.extended => {
363 let id = self.group_id;
364 self.group_id += 1;
365 let checkpoint = self.builder.checkpoint();
366 self.parse()?;
367 let range = self.parse_range()?;
368 self.builder
369 .start_internal_at(checkpoint, Token::Group(id), range);
370 self.builder.finish_internal();
371 continue;
372 }
373 b')' if !self.extended => break,
374 b'|' if !self.extended => {
375 self.builder.finish_internal();
376 self.builder
377 .start_internal(Token::Alternative, Range(1, Some(1)));
378 continue;
379 }
380 b'<' => Token::WordStart,
381 b'>' => Token::WordEnd,
382 b'a' => Token::OneOf {
383 invert: false,
384 list: vec![Collation::Class(ctype::is_alnum)],
385 },
386 b'd' => Token::OneOf {
387 invert: false,
388 list: vec![Collation::Class(ctype::is_digit)],
389 },
390 b's' => Token::OneOf {
391 invert: false,
392 list: vec![Collation::Class(ctype::is_space)],
393 },
394 b'S' => Token::OneOf {
395 invert: true,
396 list: vec![Collation::Class(ctype::is_space)],
397 },
398 b'n' => Token::Char(b'\n'),
399 b'r' => Token::Char(b'\r'),
400 b't' => Token::Char(b'\t'),
401 c => Token::Char(c),
402 },
403 b'(' if self.extended => {
404 let id = self.group_id;
405 self.group_id += 1;
406 let checkpoint = self.builder.checkpoint();
407 self.parse()?;
408 let range = self.parse_range()?;
409 self.builder
410 .start_internal_at(checkpoint, Token::Group(id), range);
411 self.builder.finish_internal();
412 continue;
413 }
414 b')' if self.extended => break,
415 b'|' if self.extended => {
416 self.builder.finish_internal();
417 self.builder
418 .start_internal(Token::Alternative, Range(1, Some(1)));
419 continue;
420 }
421 c => Token::Char(c),
422 };
423 let range = self.parse_range()?;
424 self.builder.leaf(token, range);
425 }
426 self.builder.finish_internal();
427 Ok(())
428 }
429}
430
431#[cfg(test)]
432mod tests {
433 use super::*;
434
435 use alloc::{format, string::String};
436
437 fn compile(input: &[u8]) -> String {
438 format!(
439 "{:?}",
440 PosixRegexBuilder::new(input)
441 .with_default_classes()
442 .compile_tokens()
443 .expect("error compiling regex")
444 )
445 }
446
447 #[test]
448 fn basic() {
449 assert_eq!(
450 compile(b"abc"),
451 "\
452Root 1..1
453 Alternative 1..1
454 'a' 1..1
455 'b' 1..1
456 'c' 1..1
457"
458 );
459 }
460 #[test]
461 fn groups() {
462 assert_eq!(
463 compile(br"\(abc\|bcd\|cde\)"),
464 "\
465Root 1..1
466 Alternative 1..1
467 Group(1) 1..1
468 Alternative 1..1
469 'a' 1..1
470 'b' 1..1
471 'c' 1..1
472 Alternative 1..1
473 'b' 1..1
474 'c' 1..1
475 'd' 1..1
476 Alternative 1..1
477 'c' 1..1
478 'd' 1..1
479 'e' 1..1
480"
481 );
482 assert_eq!(
483 compile(br"\(abc\|\(bcd\|cde\)\)"),
484 "\
485Root 1..1
486 Alternative 1..1
487 Group(1) 1..1
488 Alternative 1..1
489 'a' 1..1
490 'b' 1..1
491 'c' 1..1
492 Alternative 1..1
493 Group(2) 1..1
494 Alternative 1..1
495 'b' 1..1
496 'c' 1..1
497 'd' 1..1
498 Alternative 1..1
499 'c' 1..1
500 'd' 1..1
501 'e' 1..1
502"
503 );
504 }
505 #[test]
506 fn words() {
507 assert_eq!(
508 compile(br"\<word\>"),
509 "\
510Root 1..1
511 Alternative 1..1
512 < 1..1
513 'w' 1..1
514 'o' 1..1
515 'r' 1..1
516 'd' 1..1
517 > 1..1
518"
519 );
520 }
521 #[test]
522 fn repetitions() {
523 assert_eq!(
524 compile(br"yeee*"),
525 "\
526Root 1..1
527 Alternative 1..1
528 'y' 1..1
529 'e' 1..1
530 'e' 1..1
531 'e' 0..
532"
533 );
534 assert_eq!(
535 compile(br"yee\?"),
536 "\
537Root 1..1
538 Alternative 1..1
539 'y' 1..1
540 'e' 1..1
541 'e' 0..1
542"
543 );
544 assert_eq!(
545 compile(br"yee\+"),
546 "\
547Root 1..1
548 Alternative 1..1
549 'y' 1..1
550 'e' 1..1
551 'e' 1..
552"
553 );
554 assert_eq!(
555 compile(br"ye\{2}"),
556 "\
557Root 1..1
558 Alternative 1..1
559 'y' 1..1
560 'e' 2..2
561"
562 );
563 assert_eq!(
564 compile(br"ye\{2,}"),
565 "\
566Root 1..1
567 Alternative 1..1
568 'y' 1..1
569 'e' 2..
570"
571 );
572 assert_eq!(
573 compile(br"ye\{2,3}"),
574 "\
575Root 1..1
576 Alternative 1..1
577 'y' 1..1
578 'e' 2..3
579"
580 );
581 }
582 #[test]
583 fn bracket() {
584 assert_eq!(
585 compile(b"[abc]"),
586 "\
587Root 1..1
588 Alternative 1..1
589 {invert: false, ['a', 'b', 'c']} 1..1
590"
591 );
592 assert_eq!(
593 compile(b"[^abc]"),
594 "\
595Root 1..1
596 Alternative 1..1
597 {invert: true, ['a', 'b', 'c']} 1..1
598"
599 );
600 assert_eq!(
601 compile(b"[]] [^]]"),
602 "\
603Root 1..1
604 Alternative 1..1
605 {invert: false, [']']} 1..1
606 ' ' 1..1
607 {invert: true, [']']} 1..1
608"
609 );
610 assert_eq!(
611 compile(b"[0-3] [a-c] [-1] [1-]"),
612 "\
613Root 1..1
614 Alternative 1..1
615 {invert: false, ['0', '1', '2', '3']} 1..1
616 ' ' 1..1
617 {invert: false, ['a', 'b', 'c']} 1..1
618 ' ' 1..1
619 {invert: false, ['-', '1']} 1..1
620 ' ' 1..1
621 {invert: false, ['1', '-']} 1..1
622"
623 );
624 assert_eq!(
625 compile(b"[[.-.]-/]"),
626 "\
627Root 1..1
628 Alternative 1..1
629 {invert: false, ['-', '.', '/']} 1..1
630"
631 );
632 assert_eq!(
633 compile(b"[[:digit:][:upper:]]"),
634 format!(
635 "\
636Root 1..1
637 Alternative 1..1
638 {{invert: false, [{:p}, {:p}]}} 1..1
639",
640 ctype::is_digit as fn(u8) -> bool,
641 ctype::is_upper as fn(u8) -> bool
642 )
643 );
644 }
645 #[test]
646 fn newline() {
647 assert_eq!(
648 compile(br"\r\n"),
649 "\
650Root 1..1
651 Alternative 1..1
652 '\\r' 1..1
653 '\\n' 1..1
654"
655 );
656 }
657 #[test]
658 fn backref() {
659 assert_eq!(
660 compile(br"\([abc]\)\1"),
661 "\
662Root 1..1
663 Alternative 1..1
664 Group(1) 1..1
665 Alternative 1..1
666 {invert: false, ['a', 'b', 'c']} 1..1
667 \\1 1..1
668"
669 )
670 }
671
672 fn compile_extended(input: &[u8]) -> String {
673 format!(
674 "{:?}",
675 PosixRegexBuilder::new(input)
676 .with_default_classes()
677 .extended(true)
678 .compile_tokens()
679 .expect("error compiling regex")
680 )
681 }
682
683 #[test]
684 fn basic_extended() {
685 assert_eq!(
686 compile_extended(b"abc"),
687 "\
688Root 1..1
689 Alternative 1..1
690 'a' 1..1
691 'b' 1..1
692 'c' 1..1
693"
694 );
695 }
696 #[test]
697 fn groups_extended() {
698 assert_eq!(
699 compile_extended(br"(abc|bcd|cde)"),
700 "\
701Root 1..1
702 Alternative 1..1
703 Group(1) 1..1
704 Alternative 1..1
705 'a' 1..1
706 'b' 1..1
707 'c' 1..1
708 Alternative 1..1
709 'b' 1..1
710 'c' 1..1
711 'd' 1..1
712 Alternative 1..1
713 'c' 1..1
714 'd' 1..1
715 'e' 1..1
716"
717 );
718 assert_eq!(
719 compile_extended(br"(abc|(bcd|cde))"),
720 "\
721Root 1..1
722 Alternative 1..1
723 Group(1) 1..1
724 Alternative 1..1
725 'a' 1..1
726 'b' 1..1
727 'c' 1..1
728 Alternative 1..1
729 Group(2) 1..1
730 Alternative 1..1
731 'b' 1..1
732 'c' 1..1
733 'd' 1..1
734 Alternative 1..1
735 'c' 1..1
736 'd' 1..1
737 'e' 1..1
738"
739 );
740 }
741 #[test]
742 fn words_extended() {
743 assert_eq!(
744 compile_extended(br"\<word\>"),
745 "\
746Root 1..1
747 Alternative 1..1
748 < 1..1
749 'w' 1..1
750 'o' 1..1
751 'r' 1..1
752 'd' 1..1
753 > 1..1
754"
755 );
756 }
757
758 #[test]
759 fn repetitions_extended() {
760 assert_eq!(
761 compile_extended(br"yeee*"),
762 "\
763Root 1..1
764 Alternative 1..1
765 'y' 1..1
766 'e' 1..1
767 'e' 1..1
768 'e' 0..
769"
770 );
771 assert_eq!(
772 compile_extended(br"yee?"),
773 "\
774Root 1..1
775 Alternative 1..1
776 'y' 1..1
777 'e' 1..1
778 'e' 0..1
779"
780 );
781 assert_eq!(
782 compile_extended(br"yee+"),
783 "\
784Root 1..1
785 Alternative 1..1
786 'y' 1..1
787 'e' 1..1
788 'e' 1..
789"
790 );
791 assert_eq!(
792 compile_extended(br"ye{2}"),
793 "\
794Root 1..1
795 Alternative 1..1
796 'y' 1..1
797 'e' 2..2
798"
799 );
800 assert_eq!(
801 compile_extended(br"ye{2,}"),
802 "\
803Root 1..1
804 Alternative 1..1
805 'y' 1..1
806 'e' 2..
807"
808 );
809 assert_eq!(
810 compile_extended(br"ye{2,3}"),
811 "\
812Root 1..1
813 Alternative 1..1
814 'y' 1..1
815 'e' 2..3
816"
817 );
818 }
819 #[test]
820 fn bracket_extended() {
821 assert_eq!(
822 compile_extended(b"[abc]"),
823 "\
824Root 1..1
825 Alternative 1..1
826 {invert: false, ['a', 'b', 'c']} 1..1
827"
828 );
829 assert_eq!(
830 compile_extended(b"[^abc]"),
831 "\
832Root 1..1
833 Alternative 1..1
834 {invert: true, ['a', 'b', 'c']} 1..1
835"
836 );
837 assert_eq!(
838 compile_extended(b"[]] [^]]"),
839 "\
840Root 1..1
841 Alternative 1..1
842 {invert: false, [']']} 1..1
843 ' ' 1..1
844 {invert: true, [']']} 1..1
845"
846 );
847 assert_eq!(
848 compile_extended(b"[0-3] [a-c] [-1] [1-]"),
849 "\
850Root 1..1
851 Alternative 1..1
852 {invert: false, ['0', '1', '2', '3']} 1..1
853 ' ' 1..1
854 {invert: false, ['a', 'b', 'c']} 1..1
855 ' ' 1..1
856 {invert: false, ['-', '1']} 1..1
857 ' ' 1..1
858 {invert: false, ['1', '-']} 1..1
859"
860 );
861 assert_eq!(
862 compile_extended(b"[[.-.]-/]"),
863 "\
864Root 1..1
865 Alternative 1..1
866 {invert: false, ['-', '.', '/']} 1..1
867"
868 );
869 assert_eq!(
870 compile_extended(b"[[:digit:][:upper:]]"),
871 format!(
872 "\
873Root 1..1
874 Alternative 1..1
875 {{invert: false, [{:p}, {:p}]}} 1..1
876",
877 ctype::is_digit as fn(u8) -> bool,
878 ctype::is_upper as fn(u8) -> bool
879 )
880 );
881 }
882 #[test]
883 fn newline_extended() {
884 assert_eq!(
885 compile_extended(br"\r\n"),
886 "\
887Root 1..1
888 Alternative 1..1
889 '\\r' 1..1
890 '\\n' 1..1
891"
892 );
893 }
894 #[test]
895 fn backref_extended() {
896 assert_eq!(
897 compile_extended(br"([abc])\1"),
898 "\
899Root 1..1
900 Alternative 1..1
901 Group(1) 1..1
902 Alternative 1..1
903 {invert: false, ['a', 'b', 'c']} 1..1
904 \\1 1..1
905"
906 )
907 }
908}