Skip to main content

resharp_algebra/unicode_classes/
mod.rs

1mod classes;
2
3use crate::{NodeId, RegexBuilder};
4
5pub use classes::{
6    build_digit_class, build_digit_class_full, build_space_class, build_word_class,
7    build_word_class_full,
8};
9
10/// Node matching any single UTF-8 codepoint.
11pub fn utf8_char(b: &mut RegexBuilder) -> NodeId {
12    let ascii = b.mk_range_u8(0, 127);
13    let cont = b.mk_range_u8(0x80, 0xBF);
14    let c2 = b.mk_range_u8(0xC0, 0xDF);
15    let c2s = b.mk_concat(c2, cont);
16    let e0 = b.mk_range_u8(0xE0, 0xEF);
17    let e0s = b.mk_concats([e0, cont, cont].into_iter());
18    let f0 = b.mk_range_u8(0xF0, 0xF7);
19    let f0s = b.mk_concats([f0, cont, cont, cont].into_iter());
20    b.mk_unions([ascii, c2s, e0s, f0s].into_iter())
21}
22
23/// Complement of `positive` restricted to the UTF-8 codepoint universe.
24pub fn neg_class(b: &mut RegexBuilder, positive: NodeId) -> NodeId {
25    let neg = b.mk_compl(positive);
26    let uc = utf8_char(b);
27    b.mk_inters([neg, uc].into_iter())
28}
29
30#[derive(Clone, Debug)]
31pub struct UnicodeClassCache {
32    pub word: NodeId,
33    pub non_word: NodeId,
34    pub digit: NodeId,
35    pub non_digit: NodeId,
36    pub space: NodeId,
37    pub non_space: NodeId,
38}
39
40impl Default for UnicodeClassCache {
41    fn default() -> Self {
42        UnicodeClassCache {
43            word: NodeId::MISSING,
44            non_word: NodeId::MISSING,
45            digit: NodeId::MISSING,
46            non_digit: NodeId::MISSING,
47            space: NodeId::MISSING,
48            non_space: NodeId::MISSING,
49        }
50    }
51}
52
53impl UnicodeClassCache {
54    pub fn ensure_word(&mut self, b: &mut RegexBuilder) {
55        if self.word == NodeId::MISSING {
56            self.word = build_word_class(b);
57            self.non_word = neg_class(b, self.word);
58        }
59    }
60
61    pub fn ensure_word_full(&mut self, b: &mut RegexBuilder) {
62        if self.word == NodeId::MISSING {
63            self.word = build_word_class_full(b);
64            self.non_word = neg_class(b, self.word);
65        }
66    }
67
68    pub fn ensure_digit(&mut self, b: &mut RegexBuilder) {
69        if self.digit == NodeId::MISSING {
70            self.digit = build_digit_class(b);
71            self.non_digit = neg_class(b, self.digit);
72        }
73    }
74
75    pub fn ensure_digit_full(&mut self, b: &mut RegexBuilder) {
76        if self.digit == NodeId::MISSING {
77            self.digit = build_digit_class_full(b);
78            self.non_digit = neg_class(b, self.digit);
79        }
80    }
81
82    pub fn ensure_space(&mut self, b: &mut RegexBuilder) {
83        if self.space == NodeId::MISSING {
84            self.space = build_space_class(b);
85            self.non_space = neg_class(b, self.space);
86        }
87    }
88}