Skip to main content

resharp_algebra/unicode_classes/
mod.rs

1mod classes;
2
3use crate::{NodeId, RegexBuilder};
4
5pub use classes::{
6    build_digit_class, build_digit_class_full, build_space_class, build_space_class_full,
7    build_word_class, build_word_class_full,
8};
9
10/// Node matching any single UTF-8 codepoint.
11pub fn utf8_char(b: &mut RegexBuilder) -> NodeId {
12    let ascii = b.mk_range_u8(0, 127);
13    let cont = b.mk_range_u8(0x80, 0xBF);
14    let c2 = b.mk_range_u8(0xC0, 0xDF);
15    let c2s = b.mk_concat(c2, cont);
16    let e0 = b.mk_range_u8(0xE0, 0xEF);
17    let e0s = b.mk_concats([e0, cont, cont].into_iter());
18    let f0 = b.mk_range_u8(0xF0, 0xF7);
19    let f0s = b.mk_concats([f0, cont, cont, cont].into_iter());
20    b.mk_unions([ascii, c2s, e0s, f0s].into_iter())
21}
22
23/// Complement of `positive` restricted to the UTF-8 codepoint universe.
24pub fn neg_class(b: &mut RegexBuilder, positive: NodeId) -> NodeId {
25    let neg = b.mk_compl(positive);
26    let uc = utf8_char(b);
27    b.mk_inters([neg, uc].into_iter())
28}
29
30#[derive(Clone, Debug)]
31pub struct UnicodeClassCache {
32    pub word: NodeId,
33    pub non_word: NodeId,
34    pub digit: NodeId,
35    pub non_digit: NodeId,
36    pub space: NodeId,
37    pub non_space: NodeId,
38    pub wb: NodeId,
39    pub non_wb: NodeId,
40}
41
42impl Default for UnicodeClassCache {
43    fn default() -> Self {
44        UnicodeClassCache {
45            word: NodeId::MISSING,
46            non_word: NodeId::MISSING,
47            digit: NodeId::MISSING,
48            non_digit: NodeId::MISSING,
49            space: NodeId::MISSING,
50            non_space: NodeId::MISSING,
51            wb: NodeId::MISSING,
52            non_wb: NodeId::MISSING,
53        }
54    }
55}
56
57impl UnicodeClassCache {
58    pub fn ensure_word(&mut self, b: &mut RegexBuilder) {
59        if self.word == NodeId::MISSING {
60            self.word = build_word_class(b);
61            self.non_word = neg_class(b, self.word);
62        }
63    }
64
65    pub fn ensure_word_ascii(&mut self, b: &mut RegexBuilder) {
66        if self.word != NodeId::MISSING {
67            return;
68        }
69        let az = b.mk_range_u8(b'a', b'z');
70        let big = b.mk_range_u8(b'A', b'Z');
71        let dig = b.mk_range_u8(b'0', b'9');
72        let us = b.mk_u8(b'_');
73        self.word = b.mk_unions([az, big, dig, us].into_iter());
74        self.non_word = neg_class(b, self.word);
75    }
76
77    pub fn ensure_word_full(&mut self, b: &mut RegexBuilder) {
78        if self.word == NodeId::MISSING {
79            self.word = build_word_class_full(b);
80            self.non_word = neg_class(b, self.word);
81        }
82    }
83
84    pub fn ensure_digit(&mut self, b: &mut RegexBuilder) {
85        if self.digit == NodeId::MISSING {
86            self.digit = build_digit_class(b);
87            self.non_digit = neg_class(b, self.digit);
88        }
89    }
90
91    pub fn ensure_digit_full(&mut self, b: &mut RegexBuilder) {
92        if self.digit == NodeId::MISSING {
93            self.digit = build_digit_class_full(b);
94            self.non_digit = neg_class(b, self.digit);
95        }
96    }
97
98    pub fn ensure_space(&mut self, b: &mut RegexBuilder) {
99        if self.space == NodeId::MISSING {
100            self.space = build_space_class(b);
101            self.non_space = neg_class(b, self.space);
102        }
103    }
104
105    pub fn ensure_space_full(&mut self, b: &mut RegexBuilder) {
106        if self.space == NodeId::MISSING {
107            self.space = build_space_class_full(b);
108            self.non_space = neg_class(b, self.space);
109        }
110    }
111
112    // \b  = (?<=\w)(?!\w) | (?<!\w)(?=\w)
113    // \B  = (?<=\w)(?=\w)  | (?<!\w)(?!\w)
114    pub fn ensure_wb(&mut self, b: &mut RegexBuilder) {
115        if self.wb != NodeId::MISSING {
116            return;
117        }
118        debug_assert!(
119            self.word != NodeId::MISSING,
120            "call ensure_word(_full|_ascii) first"
121        );
122        let w = self.word;
123        let lb_w = b.mk_lookbehind(w, NodeId::MISSING);
124        let lb_nw = b.mk_neg_lookbehind(w);
125        let la_w = {
126            let tail = b.mk_concat(w, NodeId::TS);
127            b.mk_lookahead(tail, NodeId::MISSING, 0)
128        };
129        let la_nw = b.mk_neg_lookahead(w, 0);
130        let wb_a = b.mk_concat(lb_w, la_nw);
131        let wb_b = b.mk_concat(lb_nw, la_w);
132        self.wb = b.mk_union(wb_a, wb_b);
133        let nwb_a = b.mk_concat(lb_w, la_w);
134        let nwb_b = b.mk_concat(lb_nw, la_nw);
135        self.non_wb = b.mk_union(nwb_a, nwb_b);
136    }
137}