pag_parser/frontend/
unicode.rs

1// Copyright (c) 2023 Paguroidea Developers
2//
3// Licensed under the Apache License, Version 2.0
4// <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
5// license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. All files in the project carrying such notice may not be copied,
7// modified, or distributed except according to those terms.
8use pag_lexer::normalization::normalize;
9use pag_lexer::regex_tree::RegexTree;
10use std::rc::Rc;
11
12pub fn encode_char(x: char) -> Rc<RegexTree> {
13    let mut buf = [0; 4];
14    x.encode_utf8(&mut buf)
15        .bytes()
16        .map(RegexTree::single)
17        .reduce(|acc, e| RegexTree::Concat(Rc::new(acc), Rc::new(e)))
18        .map(Rc::new)
19        .unwrap()
20}
21
22fn full_range_2() -> Rc<RegexTree> {
23    Rc::new(RegexTree::Concat(
24        Rc::new(RegexTree::range(0xc0..=0xdf)),
25        Rc::new(RegexTree::range(0x80..=0xbf)),
26    ))
27}
28
29fn full_range_3() -> Rc<RegexTree> {
30    Rc::new(RegexTree::Concat(
31        Rc::new(RegexTree::Concat(
32            Rc::new(RegexTree::range(0xe0..=0xef)),
33            Rc::new(RegexTree::range(0x80..=0xbf)),
34        )),
35        Rc::new(RegexTree::range(0x80..=0xbf)),
36    ))
37}
38
39fn encode_same_level1(x: char, y: char) -> Rc<RegexTree> {
40    encode_same_level_expanded(1, &[x as u8], &[y as u8])
41}
42
43fn encode_same_level2(x: char, y: char) -> Rc<RegexTree> {
44    let x_fst = (0xc0 | (x as u32 >> 6)) as u8;
45    let x_snd = (0x80 | (x as u32 & 0x3f)) as u8;
46    let y_fst = (0xc0 | (y as u32 >> 6)) as u8;
47    let y_snd = (0x80 | (y as u32 & 0x3f)) as u8;
48    encode_same_level_expanded(2, &[x_fst, x_snd], &[y_fst, y_snd])
49}
50
51fn encode_same_level3(x: char, y: char) -> Rc<RegexTree> {
52    let x_fst = (0xe0 | (x as u32 >> 12)) as u8;
53    let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8;
54    let x_trd = (0x80 | (x as u32 & 0x3f)) as u8;
55    let y_fst = (0xe0 | (y as u32 >> 12)) as u8;
56    let y_snd = (0x80 | ((y as u32 >> 6) & 0x3f)) as u8;
57    let y_trd = (0x80 | (y as u32 & 0x3f)) as u8;
58    encode_same_level_expanded(3, &[x_fst, x_snd, x_trd], &[y_fst, y_snd, y_trd])
59}
60
61fn encode_same_level4(x: char, y: char) -> Rc<RegexTree> {
62    let x_fst = (0xf0 | (x as u32 >> 18)) as u8;
63    let x_snd = (0x80 | ((x as u32 >> 12) & 0x3f)) as u8;
64    let x_trd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8;
65    let x_fth = (0x80 | (x as u32 & 0x3f)) as u8;
66    let y_fst = (0xf0 | (y as u32 >> 18)) as u8;
67    let y_snd = (0x80 | ((y as u32 >> 12) & 0x3f)) as u8;
68    let y_trd = (0x80 | ((y as u32 >> 6) & 0x3f)) as u8;
69    let y_fth = (0x80 | (y as u32 & 0x3f)) as u8;
70    encode_same_level_expanded(
71        4,
72        &[x_fst, x_snd, x_trd, x_fth],
73        &[y_fst, y_snd, y_trd, y_fth],
74    )
75}
76
77const ALL_BF: [u8; 4] = [0xbf, 0xbf, 0xbf, 0xbf];
78const ALL_80: [u8; 4] = [0x80, 0x80, 0x80, 0x80];
79
80fn encode_same_level_expanded(level: usize, tuple_x: &[u8], tuple_y: &[u8]) -> Rc<RegexTree> {
81    if level == 1 {
82        return Rc::new(RegexTree::range(tuple_x[0]..=tuple_y[0]));
83    }
84    if tuple_x[0] == tuple_y[0] {
85        Rc::new(RegexTree::Concat(
86            Rc::new(RegexTree::single(tuple_x[0])),
87            encode_same_level_expanded(level - 1, &tuple_x[1..], &tuple_y[1..]),
88        ))
89    } else {
90        Rc::new(RegexTree::Union(
91            Rc::new(RegexTree::Union(
92                Rc::new(RegexTree::Concat(
93                    Rc::new(RegexTree::single(tuple_x[0])),
94                    encode_same_level_expanded(level - 1, &tuple_x[1..], &ALL_BF),
95                )),
96                Rc::new(RegexTree::Concat(
97                    Rc::new(RegexTree::range(tuple_x[0] + 1..=tuple_y[0] - 1)),
98                    encode_same_level_expanded(level - 1, &ALL_80, &ALL_BF),
99                )),
100            )),
101            Rc::new(RegexTree::Concat(
102                Rc::new(RegexTree::single(tuple_y[0])),
103                encode_same_level_expanded(level - 1, &ALL_80, &tuple_y[1..]),
104            )),
105        ))
106    }
107}
108
109fn encode_le_expanded(level: usize, fst_bound: u8, tuple: &[u8]) -> Rc<RegexTree> {
110    if level == 1 {
111        return Rc::new(RegexTree::range(fst_bound..=tuple[0]));
112    }
113    Rc::new(RegexTree::Union(
114        Rc::new(RegexTree::Concat(
115            Rc::new(RegexTree::single(tuple[0])),
116            encode_le_expanded(level - 1, 0x80, &tuple[1..]),
117        )),
118        Rc::new(RegexTree::Concat(
119            Rc::new(RegexTree::range(fst_bound..=tuple[0] - 1)),
120            encode_le_expanded(level - 1, 0x80, &ALL_BF),
121        )),
122    ))
123}
124
125fn encode_ge_expanded(level: usize, fst_bound: u8, tuple: &[u8]) -> Rc<RegexTree> {
126    if level == 1 {
127        return Rc::new(RegexTree::range(tuple[0]..=fst_bound));
128    }
129    Rc::new(RegexTree::Union(
130        Rc::new(RegexTree::Concat(
131            Rc::new(RegexTree::single(tuple[0])),
132            encode_ge_expanded(level - 1, 0xBF, &tuple[1..]),
133        )),
134        Rc::new(RegexTree::Concat(
135            Rc::new(RegexTree::range(tuple[0] + 1..=fst_bound)),
136            encode_ge_expanded(level - 1, 0xBF, &ALL_80),
137        )),
138    ))
139}
140
141fn encode_ge1(x: char) -> Rc<RegexTree> {
142    encode_ge_expanded(1, 0x7F, &[x as u8])
143}
144
145fn encode_ge2(x: char) -> Rc<RegexTree> {
146    let x_fst = (0xc0 | (x as u32 >> 6)) as u8;
147    let x_snd = (0x80 | (x as u32 & 0x3f)) as u8;
148    encode_ge_expanded(2, 0xDF, &[x_fst, x_snd])
149}
150
151fn encode_ge3(x: char) -> Rc<RegexTree> {
152    let x_fst = (0xe0 | (x as u32 >> 12)) as u8;
153    let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8;
154    let x_trd = (0x80 | (x as u32 & 0x3f)) as u8;
155    encode_ge_expanded(3, 0xEF, &[x_fst, x_snd, x_trd])
156}
157
158fn encode_le2(x: char) -> Rc<RegexTree> {
159    let x_fst = (0xc0 | (x as u32 >> 6)) as u8;
160    let x_snd = (0x80 | (x as u32 & 0x3f)) as u8;
161    encode_le_expanded(2, 0xC0, &[x_fst, x_snd])
162}
163
164fn encode_le3(x: char) -> Rc<RegexTree> {
165    let x_fst = (0xe0 | (x as u32 >> 12)) as u8;
166    let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8;
167    let x_trd = (0x80 | (x as u32 & 0x3f)) as u8;
168    encode_le_expanded(3, 0xE0, &[x_fst, x_snd, x_trd])
169}
170
171fn encode_le4(x: char) -> Rc<RegexTree> {
172    let x_fst = (0xf0 | (x as u32 >> 18)) as u8;
173    let x_snd = (0x80 | ((x as u32 >> 12) & 0x3f)) as u8;
174    let x_trd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8;
175    let x_fth = (0x80 | (x as u32 & 0x3f)) as u8;
176    encode_le_expanded(4, 0xF0, &[x_fst, x_snd, x_trd, x_fth])
177}
178
179fn try_encode_same_level(x: char, y: char) -> Option<Rc<RegexTree>> {
180    match (x as u32, y as u32) {
181        (0x00..=0x7F, 0x00..=0x7F) => Some(encode_same_level1(x, y)),
182        (0x80..=0x7FF, 0x80..=0x7FF) => Some(encode_same_level2(x, y)),
183        (0x800..=0xFFFF, 0x800..=0xFFFF) => Some(encode_same_level3(x, y)),
184        (0x10000..=0x10FFFF, 0x10000..=0x10FFFF) => Some(encode_same_level4(x, y)),
185        _ => None,
186    }
187}
188
189pub fn encode_range(x: char, y: char) -> Rc<RegexTree> {
190    if let Some(tree) = try_encode_same_level(x, y) {
191        return normalize(tree);
192    }
193    let ranges = match (x as u32, y as u32) {
194        (0x00..=0x7F, 0x80..=0x7FF) => vec![encode_ge1(x), encode_le2(y)],
195        (0x00..=0x7F, 0x800..=0xFFFF) => vec![encode_ge1(x), full_range_2(), encode_le3(y)],
196        (0x00..=0x7F, 0x10000..=0x10FFFF) => {
197            vec![encode_ge1(x), full_range_2(), full_range_3(), encode_le4(y)]
198        }
199        (0x80..=0x7FF, 0x800..=0xFFFF) => vec![encode_ge2(x), encode_le3(y)],
200        (0x80..=0x7FF, 0x10000..=0x10FFFF) => vec![encode_ge2(x), full_range_3(), encode_le4(y)],
201        (0x800..=0xFFFF, 0x10000..=0x10FFFF) => vec![encode_ge3(x), encode_le4(y)],
202        _ => unreachable!(),
203    };
204    // fold union
205    normalize(ranges.iter().skip(1).fold(ranges[0].clone(), |acc, x| {
206        Rc::new(RegexTree::Union(acc, x.clone()))
207    }))
208}
209
210#[cfg(test)]
211mod test {
212    use super::*;
213
214    #[test]
215    fn test_encode_char() {
216        assert_eq!(encode_char('a').to_string(), "a");
217        assert_eq!(encode_char('b').to_string(), "b");
218        assert_eq!(encode_char('æ').to_string(), r"(\xc3 ~ \xa6)");
219        assert_eq!(encode_char('我').to_string(), r"((\xe6 ~ \x88) ~ \x91)");
220    }
221
222    #[test]
223    fn test_encode_range() {
224        assert_eq!(encode_range('a', 'a').to_string(), "a");
225        assert_eq!(encode_range('a', 'b').to_string(), "[a, b]");
226        assert_eq!(
227            encode_range('\u{80}', '\u{88}').to_string(),
228            r"(\xc2 ~ [\x80, \x88])"
229        );
230        assert_eq!(
231            encode_range('\u{81}', '\u{7FA}').to_string(),
232            r"((\xc2 ~ [\x81, \xbf]) ∪ (([\xc3, \xde] ~ [\x80, \xbf]) ∪ (\xdf ~ [\x80, \xba])))"
233        );
234        assert_eq!(
235            encode_range('\u{800}', '\u{808}').to_string(),
236            r"(\xe0 ~ (\xa0 ~ [\x80, \x88]))"
237        );
238        assert_eq!(
239            encode_range('\u{881}', '\u{FFA}').to_string(),
240            r"(\xe0 ~ ((\xa2 ~ [\x81, \xbf]) ∪ (([\xa3, \xbe] ~ [\x80, \xbf]) ∪ (\xbf ~ [\x80, \xba]))))"
241        );
242        assert_eq!(
243            encode_range('\u{901}', '\u{FF00}').to_string(),
244            r"((\xe0 ~ ((\xa4 ~ [\x81, \xbf]) ∪ ([\xa5, \xbf] ~ [\x80, \xbf]))) ∪ (([\xe1, \xee] ~ ([\x80, \xbf] ~ [\x80, \xbf])) ∪ (\xef ~ (([\x80, \xbb] ~ [\x80, \xbf]) ∪ (\xbc ~ \x80)))))"
245        );
246        assert_eq!(
247            encode_range('a', '\u{90}').to_string(),
248            r"([a, \x7f] ∪ (([\xc0, \xc1] ~ [\x80, \xbf]) ∪ (\xc2 ~ [\x80, \x90])))"
249        );
250        assert_eq!(
251            encode_range('a', '\u{801}').to_string(),
252            r"([a, \x7f] ∪ (([\xc0, \xdf] ~ [\x80, \xbf]) ∪ (\xe0 ~ (([\x80, \x9f] ~ [\x80, \xbf]) ∪ (\xa0 ~ [\x80, \x81])))))"
253        );
254        assert_eq!(
255            encode_range('\u{99}', '\u{2771}').to_string(),
256            r"((\xc2 ~ [\x99, \xbf]) ∪ (([\xc3, \xdf] ~ [\x80, \xbf]) ∪ (([\xe0, \xe1] ~ ([\x80, \xbf] ~ [\x80, \xbf])) ∪ (\xe2 ~ (([\x80, \x9c] ~ [\x80, \xbf]) ∪ (\x9d ~ [\x80, \xb1]))))))"
257        )
258    }
259}