1pub fn add(left: u64, right: u64) -> u64 {
2 left + right
3}
4
5use std::collections::{BTreeMap, BTreeSet};
6use std::iter;
7
8use cfg_grammar::Cfg;
9use cfg_sequence::CfgSequenceExt;
10use cfg_symbol::Symbol;
11use regex_syntax::Parser;
12use regex_syntax::hir::{Class, Hir, HirKind};
13
14pub trait CfgRegexpExt: Sized {
15 fn from_regexp(regexp: &str) -> Result<(Self, LexerMap), regex_syntax::Error>;
16}
17
18impl CfgRegexpExt for Cfg {
19 fn from_regexp(regexp: &str) -> Result<(Self, LexerMap), regex_syntax::Error> {
20 Parser::new().parse(regexp).map(Translator::cfg_from_hir)
21 }
22}
23
24#[derive(PartialEq, PartialOrd, Eq, Ord, Debug, Clone)]
25pub struct LexerClasses {
26 set: BTreeSet<(char, char)>,
27}
28
29impl From<u8> for LexerClasses {
30 fn from(value: u8) -> Self {
31 LexerClasses {
32 set: iter::once((value as char, (value + 1) as char)).collect(),
33 }
34 }
35}
36
37impl From<char> for LexerClasses {
38 fn from(value: char) -> Self {
39 LexerClasses {
40 set: iter::once((value, char::from_u32(value as u32 + 1).unwrap())).collect(),
41 }
42 }
43}
44
45impl From<Class> for LexerClasses {
46 fn from(value: Class) -> Self {
47 let set: BTreeSet<(char, char)> = match value {
48 Class::Bytes(bytes) => bytes
49 .ranges()
50 .iter()
51 .map(|range| (range.start() as char, range.end() as char))
52 .collect(),
53 Class::Unicode(unicode) => unicode
54 .ranges()
55 .iter()
56 .map(|range| (range.start(), range.end()))
57 .collect(),
58 };
59 LexerClasses { set }
60 }
61}
62
63impl LexerClasses {
64 pub fn iter(&self) -> impl Iterator<Item = (char, char)> + use<'_> {
65 self.set.iter().copied()
66 }
67}
68
69struct Translator {
70 cfg: Cfg,
71 class_map: LexerMap,
72}
73
74#[derive(Debug, Clone)]
75pub struct LexerMap {
76 pub classes: BTreeMap<LexerClasses, Symbol>,
77 ascii: Vec<Vec<Symbol>>,
78 ranges: BTreeMap<char, Vec<Symbol>>,
79}
80
81impl LexerMap {
82 pub fn new() -> Self {
83 LexerMap {
84 classes: BTreeMap::new(),
85 ascii: vec![],
86 ranges: BTreeMap::new(),
87 }
88 }
89
90 pub fn compute(&mut self) {
91 let mut result = vec![vec![]; 256];
92 for (lexer_classes, &symbol) in &self.classes {
93 for &class in &lexer_classes.set {
94 if class.0.is_ascii() {
95 for ascii in class.0 as u32..=(class.1 as u32).min(256) {
96 result[ascii as usize].push(symbol);
97 }
98 }
99 }
100 }
101 self.ascii = result;
102 let mut ranges = BTreeMap::new();
103 for (lexer_classes, &symbol) in &self.classes {
104 for &class in &lexer_classes.set {
105 ranges.entry(class.0).or_insert(vec![]).push((true, symbol));
106 ranges
107 .entry(char::from_u32(class.1 as u32 + 1).unwrap())
108 .or_insert(vec![])
109 .push((false, symbol));
110 }
111 }
112 let mut result = BTreeMap::new();
113 let mut work = BTreeSet::new();
114 for (ch, changes) in ranges {
115 for (is_added, symbol) in changes {
116 if is_added {
117 work.insert(symbol);
118 } else {
119 work.remove(&symbol);
120 }
121 }
122 result
123 .entry(ch)
124 .or_insert(vec![])
125 .extend(work.iter().copied());
126 }
127 self.ranges = result;
128 }
129
130 pub fn get(&self, ch: char) -> &[Symbol] {
131 if ch.is_ascii() {
132 &self.ascii[ch as usize][..]
133 } else {
134 self.ranges
135 .range(..=ch)
136 .next_back()
137 .map(|(_, v)| &v[..])
138 .unwrap_or(&[])
139 }
140 }
141}
142
143impl Translator {
144 fn cfg_from_hir(hir: Hir) -> (Cfg, LexerMap) {
145 let cfg = Cfg::new();
146 let class_map = LexerMap::new();
147 let mut this = Self { cfg, class_map };
148 let x = this.walk_hir(&hir, 0);
149 let lhs = match (x.len(), x.get(0).map_or(0, |y| y.len())) {
150 (0, _) => {
151 let [nulling] = this.cfg.sym();
152 this.cfg.rule(nulling).rhs([]);
153 nulling
154 }
155 (1, 1) => x[0][0],
156 (1, _) => {
157 let [inner] = this.cfg.sym();
158 this.cfg.rule(inner).rhs(&x[0][..]);
159 inner
160 }
161 _ => {
162 let [inner] = this.cfg.sym();
163 for rule in x {
164 this.cfg.rule(inner).rhs(&*rule);
165 }
166 inner
167 }
168 };
169 this.cfg.set_roots([lhs]);
170 (this.cfg, this.class_map)
171 }
172
173 fn walk_hir(&mut self, hir: &Hir, depth: usize) -> Vec<Vec<Symbol>> {
174 let indent = " ".repeat(depth);
175
176 match hir.kind() {
177 HirKind::Literal(lit) => {
178 let mut syms = vec![];
179 for &byte in &lit.0 {
180 syms.push(
181 *self
182 .class_map
183 .classes
184 .entry(byte.into())
185 .or_insert_with(|| {
186 self.cfg.next_sym(Some(format!("__byte{}", byte).into()))
187 }),
188 );
189 }
190 println!("{indent}Literal: {:?}", lit);
191 vec![syms]
192 }
193 HirKind::Class(class) => {
194 let sym = *self
195 .class_map
196 .classes
197 .entry(class.clone().into())
198 .or_insert_with(|| self.cfg.next_sym(None));
199 println!("{indent}Class: {:?}", class);
200 vec![vec![sym]]
201 }
202 HirKind::Repetition(rep) => {
203 println!("{indent}Repetition: {:?}", (rep.min, rep.max, rep.greedy));
204 let [lhs] = self.cfg.sym();
205 let x = self.walk_hir(&rep.sub, depth + 1);
206 let rhs = match (x.len(), x.get(0).map_or(0, |y| y.len())) {
207 (0, _) => {
208 unreachable!()
209 }
210 (1, 1) => x[0][0],
211 (1, _) => {
212 let [inner] = self.cfg.sym();
213 self.cfg.rule(inner).rhs(&x[0][..]);
214 inner
215 }
216 _ => {
217 let [inner] = self.cfg.sym();
218 for rule in x {
219 self.cfg.rule(inner).rhs(&*rule);
220 }
221 inner
222 }
223 };
224 self.cfg.sequence(lhs).inclusive(rep.min, rep.max).rhs(rhs);
225 vec![vec![lhs]]
226 }
227 HirKind::Capture(group) => {
228 println!("{indent}Group (capturing={}):", group.name.is_some());
229 self.walk_hir(&group.sub, depth + 1)
230 }
231 HirKind::Concat(exprs) => {
232 println!("{indent}Concat:");
233 let mut result = vec![];
234 for expr in exprs {
235 let x = self.walk_hir(expr, depth + 1);
236 match x.len() {
237 0 => {}
238 1 => {
239 result.extend(x.into_iter().next().unwrap());
240 }
241 _ => {
242 let [lhs] = self.cfg.sym();
243 for rule in x {
244 self.cfg.rule(lhs).rhs(&*rule);
245 }
246 result.push(lhs);
247 }
248 }
249 }
250 vec![result]
251 }
252 HirKind::Alternation(exprs) => {
253 println!("{indent}Alternation:");
254 let mut alternatives = vec![];
255 for expr in exprs {
256 alternatives.extend(self.walk_hir(expr, depth + 1));
257 }
258 alternatives
259 }
260 HirKind::Look(_look) => {
261 unimplemented!()
262 }
263 HirKind::Empty => {
264 println!("{indent}Empty");
265 vec![vec![]]
266 }
267 }
268 }
269}
270
271#[cfg(test)]
272mod tests {
273 use super::*;
274
275 #[test]
276 fn it_works() {
277 let result = add(2, 2);
278 assert_eq!(result, 4);
279
280 let pattern = r"(?i)(foo|bar)\d+";
281 let (result, mut map) = Cfg::from_regexp(pattern).unwrap();
282 assert_eq!(result.rules().count(), 5);
283 map.compute();
284 assert_eq!(map.get('b').len(), 1);
285 assert_eq!(map.get('c').len(), 0);
286 assert_eq!(map.get('B').len(), 1);
287 assert_eq!(map.get('D').len(), 0);
288 assert_eq!(map.get('o').len(), 1);
289 assert_eq!(map.get('🯰').len(), 1);
290 assert_eq!(map.get('🯹').len(), 1);
291 }
292}