resharp_algebra/unicode_classes/
mod.rs1mod classes;
2
3use crate::{NodeId, RegexBuilder};
4
5pub use classes::{
6 build_digit_class, build_digit_class_full, build_space_class, build_space_class_full,
7 build_word_class, build_word_class_full,
8};
9
10pub fn utf8_char(b: &mut RegexBuilder) -> NodeId {
12 let ascii = b.mk_range_u8(0, 127);
13 let cont = b.mk_range_u8(0x80, 0xBF);
14 let c2 = b.mk_range_u8(0xC0, 0xDF);
15 let c2s = b.mk_concat(c2, cont);
16 let e0 = b.mk_range_u8(0xE0, 0xEF);
17 let e0s = b.mk_concats([e0, cont, cont].into_iter());
18 let f0 = b.mk_range_u8(0xF0, 0xF7);
19 let f0s = b.mk_concats([f0, cont, cont, cont].into_iter());
20 b.mk_unions([ascii, c2s, e0s, f0s].into_iter())
21}
22
23pub fn neg_class(b: &mut RegexBuilder, positive: NodeId) -> NodeId {
25 let neg = b.mk_compl(positive);
26 let uc = utf8_char(b);
27 b.mk_inters([neg, uc].into_iter())
28}
29
30#[derive(Clone, Debug)]
31pub struct UnicodeClassCache {
32 pub word: NodeId,
33 pub non_word: NodeId,
34 pub digit: NodeId,
35 pub non_digit: NodeId,
36 pub space: NodeId,
37 pub non_space: NodeId,
38 pub wb: NodeId,
39 pub non_wb: NodeId,
40}
41
42impl Default for UnicodeClassCache {
43 fn default() -> Self {
44 UnicodeClassCache {
45 word: NodeId::MISSING,
46 non_word: NodeId::MISSING,
47 digit: NodeId::MISSING,
48 non_digit: NodeId::MISSING,
49 space: NodeId::MISSING,
50 non_space: NodeId::MISSING,
51 wb: NodeId::MISSING,
52 non_wb: NodeId::MISSING,
53 }
54 }
55}
56
57impl UnicodeClassCache {
58 pub fn ensure_word(&mut self, b: &mut RegexBuilder) {
59 if self.word == NodeId::MISSING {
60 self.word = build_word_class(b);
61 self.non_word = neg_class(b, self.word);
62 }
63 }
64
65 pub fn ensure_word_ascii(&mut self, b: &mut RegexBuilder) {
66 if self.word != NodeId::MISSING {
67 return;
68 }
69 let az = b.mk_range_u8(b'a', b'z');
70 let big = b.mk_range_u8(b'A', b'Z');
71 let dig = b.mk_range_u8(b'0', b'9');
72 let us = b.mk_u8(b'_');
73 self.word = b.mk_unions([az, big, dig, us].into_iter());
74 self.non_word = neg_class(b, self.word);
75 }
76
77 pub fn ensure_word_full(&mut self, b: &mut RegexBuilder) {
78 if self.word == NodeId::MISSING {
79 self.word = build_word_class_full(b);
80 self.non_word = neg_class(b, self.word);
81 }
82 }
83
84 pub fn ensure_digit(&mut self, b: &mut RegexBuilder) {
85 if self.digit == NodeId::MISSING {
86 self.digit = build_digit_class(b);
87 self.non_digit = neg_class(b, self.digit);
88 }
89 }
90
91 pub fn ensure_digit_full(&mut self, b: &mut RegexBuilder) {
92 if self.digit == NodeId::MISSING {
93 self.digit = build_digit_class_full(b);
94 self.non_digit = neg_class(b, self.digit);
95 }
96 }
97
98 pub fn ensure_space(&mut self, b: &mut RegexBuilder) {
99 if self.space == NodeId::MISSING {
100 self.space = build_space_class(b);
101 self.non_space = neg_class(b, self.space);
102 }
103 }
104
105 pub fn ensure_space_full(&mut self, b: &mut RegexBuilder) {
106 if self.space == NodeId::MISSING {
107 self.space = build_space_class_full(b);
108 self.non_space = neg_class(b, self.space);
109 }
110 }
111
112 pub fn ensure_wb(&mut self, b: &mut RegexBuilder) {
115 if self.wb != NodeId::MISSING {
116 return;
117 }
118 debug_assert!(
119 self.word != NodeId::MISSING,
120 "call ensure_word(_full|_ascii) first"
121 );
122 let w = self.word;
123 let lb_w = b.mk_lookbehind(w, NodeId::MISSING);
124 let lb_nw = b.mk_neg_lookbehind(w);
125 let la_w = {
126 let tail = b.mk_concat(w, NodeId::TS);
127 b.mk_lookahead(tail, NodeId::MISSING, 0)
128 };
129 let la_nw = b.mk_neg_lookahead(w, 0);
130 let wb_a = b.mk_concat(lb_w, la_nw);
131 let wb_b = b.mk_concat(lb_nw, la_w);
132 self.wb = b.mk_union(wb_a, wb_b);
133 let nwb_a = b.mk_concat(lb_w, la_w);
134 let nwb_b = b.mk_concat(lb_nw, la_nw);
135 self.non_wb = b.mk_union(nwb_a, nwb_b);
136 }
137}