cfg_regexp/
lib.rs

1pub fn add(left: u64, right: u64) -> u64 {
2    left + right
3}
4
5use std::collections::{BTreeMap, BTreeSet};
6use std::iter;
7
8use cfg_grammar::Cfg;
9use cfg_sequence::CfgSequenceExt;
10use cfg_symbol::Symbol;
11use regex_syntax::Parser;
12use regex_syntax::hir::{Class, Hir, HirKind};
13
14pub trait CfgRegexpExt: Sized {
15    fn from_regexp(regexp: &str) -> Result<(Self, LexerMap), regex_syntax::Error>;
16}
17
18impl CfgRegexpExt for Cfg {
19    fn from_regexp(regexp: &str) -> Result<(Self, LexerMap), regex_syntax::Error> {
20        Parser::new().parse(regexp).map(Translator::cfg_from_hir)
21    }
22}
23
24#[derive(PartialEq, PartialOrd, Eq, Ord, Debug, Clone)]
25pub struct LexerClasses {
26    set: BTreeSet<(char, char)>,
27}
28
29impl From<u8> for LexerClasses {
30    fn from(value: u8) -> Self {
31        LexerClasses {
32            set: iter::once((value as char, (value + 1) as char)).collect(),
33        }
34    }
35}
36
37impl From<char> for LexerClasses {
38    fn from(value: char) -> Self {
39        LexerClasses {
40            set: iter::once((value, char::from_u32(value as u32 + 1).unwrap())).collect(),
41        }
42    }
43}
44
45impl From<Class> for LexerClasses {
46    fn from(value: Class) -> Self {
47        let set: BTreeSet<(char, char)> = match value {
48            Class::Bytes(bytes) => bytes
49                .ranges()
50                .iter()
51                .map(|range| (range.start() as char, range.end() as char))
52                .collect(),
53            Class::Unicode(unicode) => unicode
54                .ranges()
55                .iter()
56                .map(|range| (range.start(), range.end()))
57                .collect(),
58        };
59        LexerClasses { set }
60    }
61}
62
63impl LexerClasses {
64    pub fn iter(&self) -> impl Iterator<Item = (char, char)> + use<'_> {
65        self.set.iter().copied()
66    }
67}
68
69struct Translator {
70    cfg: Cfg,
71    class_map: LexerMap,
72}
73
74#[derive(Debug, Clone)]
75pub struct LexerMap {
76    pub classes: BTreeMap<LexerClasses, Symbol>,
77    ascii: Vec<Vec<Symbol>>,
78    ranges: BTreeMap<char, Vec<Symbol>>,
79}
80
81impl LexerMap {
82    pub fn new() -> Self {
83        LexerMap {
84            classes: BTreeMap::new(),
85            ascii: vec![],
86            ranges: BTreeMap::new(),
87        }
88    }
89
90    pub fn compute(&mut self) {
91        let mut result = vec![vec![]; 256];
92        for (lexer_classes, &symbol) in &self.classes {
93            for &class in &lexer_classes.set {
94                if class.0.is_ascii() {
95                    for ascii in class.0 as u32..=(class.1 as u32).min(256) {
96                        result[ascii as usize].push(symbol);
97                    }
98                }
99            }
100        }
101        self.ascii = result;
102        let mut ranges = BTreeMap::new();
103        for (lexer_classes, &symbol) in &self.classes {
104            for &class in &lexer_classes.set {
105                ranges.entry(class.0).or_insert(vec![]).push((true, symbol));
106                ranges
107                    .entry(char::from_u32(class.1 as u32 + 1).unwrap())
108                    .or_insert(vec![])
109                    .push((false, symbol));
110            }
111        }
112        let mut result = BTreeMap::new();
113        let mut work = BTreeSet::new();
114        for (ch, changes) in ranges {
115            for (is_added, symbol) in changes {
116                if is_added {
117                    work.insert(symbol);
118                } else {
119                    work.remove(&symbol);
120                }
121            }
122            result
123                .entry(ch)
124                .or_insert(vec![])
125                .extend(work.iter().copied());
126        }
127        self.ranges = result;
128    }
129
130    pub fn get(&self, ch: char) -> &[Symbol] {
131        if ch.is_ascii() {
132            &self.ascii[ch as usize][..]
133        } else {
134            self.ranges
135                .range(..=ch)
136                .next_back()
137                .map(|(_, v)| &v[..])
138                .unwrap_or(&[])
139        }
140    }
141}
142
143impl Translator {
144    fn cfg_from_hir(hir: Hir) -> (Cfg, LexerMap) {
145        let cfg = Cfg::new();
146        let class_map = LexerMap::new();
147        let mut this = Self { cfg, class_map };
148        let x = this.walk_hir(&hir, 0);
149        let lhs = match (x.len(), x.get(0).map_or(0, |y| y.len())) {
150            (0, _) => {
151                let [nulling] = this.cfg.sym();
152                this.cfg.rule(nulling).rhs([]);
153                nulling
154            }
155            (1, 1) => x[0][0],
156            (1, _) => {
157                let [inner] = this.cfg.sym();
158                this.cfg.rule(inner).rhs(&x[0][..]);
159                inner
160            }
161            _ => {
162                let [inner] = this.cfg.sym();
163                for rule in x {
164                    this.cfg.rule(inner).rhs(&*rule);
165                }
166                inner
167            }
168        };
169        this.cfg.set_roots([lhs]);
170        (this.cfg, this.class_map)
171    }
172
173    fn walk_hir(&mut self, hir: &Hir, depth: usize) -> Vec<Vec<Symbol>> {
174        let indent = "  ".repeat(depth);
175
176        match hir.kind() {
177            HirKind::Literal(lit) => {
178                let mut syms = vec![];
179                for &byte in &lit.0 {
180                    syms.push(
181                        *self
182                            .class_map
183                            .classes
184                            .entry(byte.into())
185                            .or_insert_with(|| {
186                                self.cfg.next_sym(Some(format!("__byte{}", byte).into()))
187                            }),
188                    );
189                }
190                println!("{indent}Literal: {:?}", lit);
191                vec![syms]
192            }
193            HirKind::Class(class) => {
194                let sym = *self
195                    .class_map
196                    .classes
197                    .entry(class.clone().into())
198                    .or_insert_with(|| self.cfg.next_sym(None));
199                println!("{indent}Class: {:?}", class);
200                vec![vec![sym]]
201            }
202            HirKind::Repetition(rep) => {
203                println!("{indent}Repetition: {:?}", (rep.min, rep.max, rep.greedy));
204                let [lhs] = self.cfg.sym();
205                let x = self.walk_hir(&rep.sub, depth + 1);
206                let rhs = match (x.len(), x.get(0).map_or(0, |y| y.len())) {
207                    (0, _) => {
208                        unreachable!()
209                    }
210                    (1, 1) => x[0][0],
211                    (1, _) => {
212                        let [inner] = self.cfg.sym();
213                        self.cfg.rule(inner).rhs(&x[0][..]);
214                        inner
215                    }
216                    _ => {
217                        let [inner] = self.cfg.sym();
218                        for rule in x {
219                            self.cfg.rule(inner).rhs(&*rule);
220                        }
221                        inner
222                    }
223                };
224                self.cfg.sequence(lhs).inclusive(rep.min, rep.max).rhs(rhs);
225                vec![vec![lhs]]
226            }
227            HirKind::Capture(group) => {
228                println!("{indent}Group (capturing={}):", group.name.is_some());
229                self.walk_hir(&group.sub, depth + 1)
230            }
231            HirKind::Concat(exprs) => {
232                println!("{indent}Concat:");
233                let mut result = vec![];
234                for expr in exprs {
235                    let x = self.walk_hir(expr, depth + 1);
236                    match x.len() {
237                        0 => {}
238                        1 => {
239                            result.extend(x.into_iter().next().unwrap());
240                        }
241                        _ => {
242                            let [lhs] = self.cfg.sym();
243                            for rule in x {
244                                self.cfg.rule(lhs).rhs(&*rule);
245                            }
246                            result.push(lhs);
247                        }
248                    }
249                }
250                vec![result]
251            }
252            HirKind::Alternation(exprs) => {
253                println!("{indent}Alternation:");
254                let mut alternatives = vec![];
255                for expr in exprs {
256                    alternatives.extend(self.walk_hir(expr, depth + 1));
257                }
258                alternatives
259            }
260            HirKind::Look(_look) => {
261                unimplemented!()
262            }
263            HirKind::Empty => {
264                println!("{indent}Empty");
265                vec![vec![]]
266            }
267        }
268    }
269}
270
271#[cfg(test)]
272mod tests {
273    use super::*;
274
275    #[test]
276    fn it_works() {
277        let result = add(2, 2);
278        assert_eq!(result, 4);
279
280        let pattern = r"(?i)(foo|bar)\d+";
281        let (result, mut map) = Cfg::from_regexp(pattern).unwrap();
282        assert_eq!(result.rules().count(), 5);
283        map.compute();
284        assert_eq!(map.get('b').len(), 1);
285        assert_eq!(map.get('c').len(), 0);
286        assert_eq!(map.get('B').len(), 1);
287        assert_eq!(map.get('D').len(), 0);
288        assert_eq!(map.get('o').len(), 1);
289        assert_eq!(map.get('🯰').len(), 1);
290        assert_eq!(map.get('🯹').len(), 1);
291    }
292}