1use pag_lexer::normalization::normalize;
9use pag_lexer::regex_tree::RegexTree;
10use std::rc::Rc;
11
12pub fn encode_char(x: char) -> Rc<RegexTree> {
13 let mut buf = [0; 4];
14 x.encode_utf8(&mut buf)
15 .bytes()
16 .map(RegexTree::single)
17 .reduce(|acc, e| RegexTree::Concat(Rc::new(acc), Rc::new(e)))
18 .map(Rc::new)
19 .unwrap()
20}
21
22fn full_range_2() -> Rc<RegexTree> {
23 Rc::new(RegexTree::Concat(
24 Rc::new(RegexTree::range(0xc0..=0xdf)),
25 Rc::new(RegexTree::range(0x80..=0xbf)),
26 ))
27}
28
29fn full_range_3() -> Rc<RegexTree> {
30 Rc::new(RegexTree::Concat(
31 Rc::new(RegexTree::Concat(
32 Rc::new(RegexTree::range(0xe0..=0xef)),
33 Rc::new(RegexTree::range(0x80..=0xbf)),
34 )),
35 Rc::new(RegexTree::range(0x80..=0xbf)),
36 ))
37}
38
39fn encode_same_level1(x: char, y: char) -> Rc<RegexTree> {
40 encode_same_level_expanded(1, &[x as u8], &[y as u8])
41}
42
43fn encode_same_level2(x: char, y: char) -> Rc<RegexTree> {
44 let x_fst = (0xc0 | (x as u32 >> 6)) as u8;
45 let x_snd = (0x80 | (x as u32 & 0x3f)) as u8;
46 let y_fst = (0xc0 | (y as u32 >> 6)) as u8;
47 let y_snd = (0x80 | (y as u32 & 0x3f)) as u8;
48 encode_same_level_expanded(2, &[x_fst, x_snd], &[y_fst, y_snd])
49}
50
51fn encode_same_level3(x: char, y: char) -> Rc<RegexTree> {
52 let x_fst = (0xe0 | (x as u32 >> 12)) as u8;
53 let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8;
54 let x_trd = (0x80 | (x as u32 & 0x3f)) as u8;
55 let y_fst = (0xe0 | (y as u32 >> 12)) as u8;
56 let y_snd = (0x80 | ((y as u32 >> 6) & 0x3f)) as u8;
57 let y_trd = (0x80 | (y as u32 & 0x3f)) as u8;
58 encode_same_level_expanded(3, &[x_fst, x_snd, x_trd], &[y_fst, y_snd, y_trd])
59}
60
61fn encode_same_level4(x: char, y: char) -> Rc<RegexTree> {
62 let x_fst = (0xf0 | (x as u32 >> 18)) as u8;
63 let x_snd = (0x80 | ((x as u32 >> 12) & 0x3f)) as u8;
64 let x_trd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8;
65 let x_fth = (0x80 | (x as u32 & 0x3f)) as u8;
66 let y_fst = (0xf0 | (y as u32 >> 18)) as u8;
67 let y_snd = (0x80 | ((y as u32 >> 12) & 0x3f)) as u8;
68 let y_trd = (0x80 | ((y as u32 >> 6) & 0x3f)) as u8;
69 let y_fth = (0x80 | (y as u32 & 0x3f)) as u8;
70 encode_same_level_expanded(
71 4,
72 &[x_fst, x_snd, x_trd, x_fth],
73 &[y_fst, y_snd, y_trd, y_fth],
74 )
75}
76
77const ALL_BF: [u8; 4] = [0xbf, 0xbf, 0xbf, 0xbf];
78const ALL_80: [u8; 4] = [0x80, 0x80, 0x80, 0x80];
79
80fn encode_same_level_expanded(level: usize, tuple_x: &[u8], tuple_y: &[u8]) -> Rc<RegexTree> {
81 if level == 1 {
82 return Rc::new(RegexTree::range(tuple_x[0]..=tuple_y[0]));
83 }
84 if tuple_x[0] == tuple_y[0] {
85 Rc::new(RegexTree::Concat(
86 Rc::new(RegexTree::single(tuple_x[0])),
87 encode_same_level_expanded(level - 1, &tuple_x[1..], &tuple_y[1..]),
88 ))
89 } else {
90 Rc::new(RegexTree::Union(
91 Rc::new(RegexTree::Union(
92 Rc::new(RegexTree::Concat(
93 Rc::new(RegexTree::single(tuple_x[0])),
94 encode_same_level_expanded(level - 1, &tuple_x[1..], &ALL_BF),
95 )),
96 Rc::new(RegexTree::Concat(
97 Rc::new(RegexTree::range(tuple_x[0] + 1..=tuple_y[0] - 1)),
98 encode_same_level_expanded(level - 1, &ALL_80, &ALL_BF),
99 )),
100 )),
101 Rc::new(RegexTree::Concat(
102 Rc::new(RegexTree::single(tuple_y[0])),
103 encode_same_level_expanded(level - 1, &ALL_80, &tuple_y[1..]),
104 )),
105 ))
106 }
107}
108
109fn encode_le_expanded(level: usize, fst_bound: u8, tuple: &[u8]) -> Rc<RegexTree> {
110 if level == 1 {
111 return Rc::new(RegexTree::range(fst_bound..=tuple[0]));
112 }
113 Rc::new(RegexTree::Union(
114 Rc::new(RegexTree::Concat(
115 Rc::new(RegexTree::single(tuple[0])),
116 encode_le_expanded(level - 1, 0x80, &tuple[1..]),
117 )),
118 Rc::new(RegexTree::Concat(
119 Rc::new(RegexTree::range(fst_bound..=tuple[0] - 1)),
120 encode_le_expanded(level - 1, 0x80, &ALL_BF),
121 )),
122 ))
123}
124
125fn encode_ge_expanded(level: usize, fst_bound: u8, tuple: &[u8]) -> Rc<RegexTree> {
126 if level == 1 {
127 return Rc::new(RegexTree::range(tuple[0]..=fst_bound));
128 }
129 Rc::new(RegexTree::Union(
130 Rc::new(RegexTree::Concat(
131 Rc::new(RegexTree::single(tuple[0])),
132 encode_ge_expanded(level - 1, 0xBF, &tuple[1..]),
133 )),
134 Rc::new(RegexTree::Concat(
135 Rc::new(RegexTree::range(tuple[0] + 1..=fst_bound)),
136 encode_ge_expanded(level - 1, 0xBF, &ALL_80),
137 )),
138 ))
139}
140
141fn encode_ge1(x: char) -> Rc<RegexTree> {
142 encode_ge_expanded(1, 0x7F, &[x as u8])
143}
144
145fn encode_ge2(x: char) -> Rc<RegexTree> {
146 let x_fst = (0xc0 | (x as u32 >> 6)) as u8;
147 let x_snd = (0x80 | (x as u32 & 0x3f)) as u8;
148 encode_ge_expanded(2, 0xDF, &[x_fst, x_snd])
149}
150
151fn encode_ge3(x: char) -> Rc<RegexTree> {
152 let x_fst = (0xe0 | (x as u32 >> 12)) as u8;
153 let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8;
154 let x_trd = (0x80 | (x as u32 & 0x3f)) as u8;
155 encode_ge_expanded(3, 0xEF, &[x_fst, x_snd, x_trd])
156}
157
158fn encode_le2(x: char) -> Rc<RegexTree> {
159 let x_fst = (0xc0 | (x as u32 >> 6)) as u8;
160 let x_snd = (0x80 | (x as u32 & 0x3f)) as u8;
161 encode_le_expanded(2, 0xC0, &[x_fst, x_snd])
162}
163
164fn encode_le3(x: char) -> Rc<RegexTree> {
165 let x_fst = (0xe0 | (x as u32 >> 12)) as u8;
166 let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8;
167 let x_trd = (0x80 | (x as u32 & 0x3f)) as u8;
168 encode_le_expanded(3, 0xE0, &[x_fst, x_snd, x_trd])
169}
170
171fn encode_le4(x: char) -> Rc<RegexTree> {
172 let x_fst = (0xf0 | (x as u32 >> 18)) as u8;
173 let x_snd = (0x80 | ((x as u32 >> 12) & 0x3f)) as u8;
174 let x_trd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8;
175 let x_fth = (0x80 | (x as u32 & 0x3f)) as u8;
176 encode_le_expanded(4, 0xF0, &[x_fst, x_snd, x_trd, x_fth])
177}
178
179fn try_encode_same_level(x: char, y: char) -> Option<Rc<RegexTree>> {
180 match (x as u32, y as u32) {
181 (0x00..=0x7F, 0x00..=0x7F) => Some(encode_same_level1(x, y)),
182 (0x80..=0x7FF, 0x80..=0x7FF) => Some(encode_same_level2(x, y)),
183 (0x800..=0xFFFF, 0x800..=0xFFFF) => Some(encode_same_level3(x, y)),
184 (0x10000..=0x10FFFF, 0x10000..=0x10FFFF) => Some(encode_same_level4(x, y)),
185 _ => None,
186 }
187}
188
189pub fn encode_range(x: char, y: char) -> Rc<RegexTree> {
190 if let Some(tree) = try_encode_same_level(x, y) {
191 return normalize(tree);
192 }
193 let ranges = match (x as u32, y as u32) {
194 (0x00..=0x7F, 0x80..=0x7FF) => vec![encode_ge1(x), encode_le2(y)],
195 (0x00..=0x7F, 0x800..=0xFFFF) => vec![encode_ge1(x), full_range_2(), encode_le3(y)],
196 (0x00..=0x7F, 0x10000..=0x10FFFF) => {
197 vec![encode_ge1(x), full_range_2(), full_range_3(), encode_le4(y)]
198 }
199 (0x80..=0x7FF, 0x800..=0xFFFF) => vec![encode_ge2(x), encode_le3(y)],
200 (0x80..=0x7FF, 0x10000..=0x10FFFF) => vec![encode_ge2(x), full_range_3(), encode_le4(y)],
201 (0x800..=0xFFFF, 0x10000..=0x10FFFF) => vec![encode_ge3(x), encode_le4(y)],
202 _ => unreachable!(),
203 };
204 normalize(ranges.iter().skip(1).fold(ranges[0].clone(), |acc, x| {
206 Rc::new(RegexTree::Union(acc, x.clone()))
207 }))
208}
209
210#[cfg(test)]
211mod test {
212 use super::*;
213
214 #[test]
215 fn test_encode_char() {
216 assert_eq!(encode_char('a').to_string(), "a");
217 assert_eq!(encode_char('b').to_string(), "b");
218 assert_eq!(encode_char('æ').to_string(), r"(\xc3 ~ \xa6)");
219 assert_eq!(encode_char('我').to_string(), r"((\xe6 ~ \x88) ~ \x91)");
220 }
221
222 #[test]
223 fn test_encode_range() {
224 assert_eq!(encode_range('a', 'a').to_string(), "a");
225 assert_eq!(encode_range('a', 'b').to_string(), "[a, b]");
226 assert_eq!(
227 encode_range('\u{80}', '\u{88}').to_string(),
228 r"(\xc2 ~ [\x80, \x88])"
229 );
230 assert_eq!(
231 encode_range('\u{81}', '\u{7FA}').to_string(),
232 r"((\xc2 ~ [\x81, \xbf]) ∪ (([\xc3, \xde] ~ [\x80, \xbf]) ∪ (\xdf ~ [\x80, \xba])))"
233 );
234 assert_eq!(
235 encode_range('\u{800}', '\u{808}').to_string(),
236 r"(\xe0 ~ (\xa0 ~ [\x80, \x88]))"
237 );
238 assert_eq!(
239 encode_range('\u{881}', '\u{FFA}').to_string(),
240 r"(\xe0 ~ ((\xa2 ~ [\x81, \xbf]) ∪ (([\xa3, \xbe] ~ [\x80, \xbf]) ∪ (\xbf ~ [\x80, \xba]))))"
241 );
242 assert_eq!(
243 encode_range('\u{901}', '\u{FF00}').to_string(),
244 r"((\xe0 ~ ((\xa4 ~ [\x81, \xbf]) ∪ ([\xa5, \xbf] ~ [\x80, \xbf]))) ∪ (([\xe1, \xee] ~ ([\x80, \xbf] ~ [\x80, \xbf])) ∪ (\xef ~ (([\x80, \xbb] ~ [\x80, \xbf]) ∪ (\xbc ~ \x80)))))"
245 );
246 assert_eq!(
247 encode_range('a', '\u{90}').to_string(),
248 r"([a, \x7f] ∪ (([\xc0, \xc1] ~ [\x80, \xbf]) ∪ (\xc2 ~ [\x80, \x90])))"
249 );
250 assert_eq!(
251 encode_range('a', '\u{801}').to_string(),
252 r"([a, \x7f] ∪ (([\xc0, \xdf] ~ [\x80, \xbf]) ∪ (\xe0 ~ (([\x80, \x9f] ~ [\x80, \xbf]) ∪ (\xa0 ~ [\x80, \x81])))))"
253 );
254 assert_eq!(
255 encode_range('\u{99}', '\u{2771}').to_string(),
256 r"((\xc2 ~ [\x99, \xbf]) ∪ (([\xc3, \xdf] ~ [\x80, \xbf]) ∪ (([\xe0, \xe1] ~ ([\x80, \xbf] ~ [\x80, \xbf])) ∪ (\xe2 ~ (([\x80, \x9c] ~ [\x80, \xbf]) ∪ (\x9d ~ [\x80, \xb1]))))))"
257 )
258 }
259}