subscript_compiler/frontend/
parser.rs1use std::rc::Rc;
7use std::borrow::Cow;
8use std::collections::{HashSet, VecDeque, LinkedList};
9use std::iter::FromIterator;
10use std::vec;
11use serde::de::value;
12use unicode_segmentation::UnicodeSegmentation;
13
14use crate::frontend::data::*;
15use crate::frontend::ast::*;
16
17
18
19#[derive(Debug)]
24struct Zipper<T> {
25 left: Option<T>,
26 current: T,
27 right: Option<T>,
28}
29
30enum ZipperConsumed {
31 Current,
32 Right,
33}
34
35#[derive(Debug, Clone)]
36pub enum Mode<'a> {
37 BeginEnclosure {
38 kind: &'a str,
39 },
40 EndEnclosure {
41 kind: &'a str,
42 },
43 Ident(&'a str),
44 NoOP,
45}
46
47#[derive(Debug, Clone, PartialEq)]
50pub enum OpenToken {
51 CurlyBrace,
52 SquareParen,
53 Parens,
54}
55
56impl OpenToken {
57 pub fn as_str(&self) -> &'static str {
58 match self {
59 OpenToken::CurlyBrace => "{",
60 OpenToken::SquareParen => "[",
61 OpenToken::Parens => "(",
62 }
63 }
64 pub fn new<'a>(token: Atom<'a>) -> Option<OpenToken> {
65 let token: &str = &token;
66 match (token) {
67 ("{") => Some(OpenToken::CurlyBrace),
68 ("[") => Some(OpenToken::SquareParen),
69 ("(") => Some(OpenToken::Parens),
70 (_) => None,
71 }
72 }
73}
74
75#[derive(Debug, Clone)]
76struct PartialBlock<'a> {
77 open_token: Ann<OpenToken>,
78 children: LinkedList<Node<'a>>,
79}
80
81#[derive(Debug, Clone)]
82enum Branch<'a> {
83 PartialBlock(PartialBlock<'a>),
84 Node(Node<'a>),
85}
86
87#[derive(Debug, Default)]
88pub struct ParseTree<'a> {
89 scopes: VecDeque<PartialBlock<'a>>,
90 finalized: LinkedList<Node<'a>>,
91}
92
93impl<'a> ParseTree<'a> {
98 fn add_child_node(&mut self, new_node: Node<'a>) {
99 match self.scopes.back_mut() {
100 Some(scope) => {
101 scope.children.push_back(new_node);
102 }
103 None => {
104 self.finalized.push_back(new_node);
105 }
106 }
107 }
108 fn open_new_enclosure(&mut self, new_enclosure: PartialBlock<'a>) {
109 self.scopes.push_back(new_enclosure);
110 }
111 fn close_last_enclosure(&mut self, close_word: &Word<'a>) {
112 match self.scopes.pop_back() {
113 Some(scope) => {
114 let new_node = Enclosure {
115 kind: EnclosureKind::new(
116 Cow::Borrowed(scope.open_token.data.as_str()),
117 Cow::Borrowed(close_word.word),
118 ),
119 children: scope.children.into_iter().collect()
120 };
121 let range = {
122 let start = scope.open_token.start();
123 let end = close_word.range.end;
124 CharRange::join(start, Some(end))
125 };
126 self.add_child_node(Node::Enclosure(Ann::join(range, new_node)));
127 }
128 None => {
129 let new_node = Node::InvalidToken(Ann::new(
130 close_word.range,
131 Cow::Borrowed(close_word.word),
132 ));
133 self.add_child_node(new_node);
134 }
135 }
136 }
137 pub fn finalize_all(self) -> Vec<Node<'a>> {
138 let ParseTree { mut scopes, mut finalized } = self;
139 let scopes = scopes.drain(..);
140 let xs = scopes
141 .map(|scope| {
142 let enclosure = Enclosure{
143 kind: EnclosureKind::Error{
144 open: Cow::Borrowed(scope.open_token.data.as_str()),
145 close: None
146 },
147 children: scope.children.into_iter().collect()
148 };
149 Node::Enclosure(Ann::join(
150 scope.open_token.range(),
151 enclosure,
152 ))
153 });
154 finalized.extend(xs);
155 finalized.into_iter().collect()
156 }
157}
158
159impl<'a> ParseTree<'a> {
165 pub fn parse_words(words: Vec<Word<'a>>) -> Vec<Node<'a>> {
166 let mut parse_tree = ParseTree::default();
167 let mut skip_to: Option<usize> = None;
168 for pos in 0..words.len() {
169 if let Some(start_from) = skip_to {
170 if pos <= start_from {
171 continue;
172 } else {
173 skip_to = None;
174 }
175 }
176 let forward = |by: usize| {
177 words
178 .get(pos + by)
179 .filter(|w| !w.is_whitespace())
180 .map(|w| (by, w))
181 };
182 let current = &words[pos];
183 let next = {
184 let mut entry = None::<(usize, &Word)>;
185 let words_left = words.len() - pos;
186 for offset in 1..words_left {
187 assert!(entry.is_none());
188 entry = forward(offset);
189 if entry.is_some() {break}
190 }
191 entry
192 };
193 let (mode, consumed) = match_word(
194 current.word,
195 next.map(|(_, x)| x.word)
196 );
197 match mode {
198 Mode::BeginEnclosure {kind} => {
199 let start_pos = current.range.start;
200 let new_stack = PartialBlock {
201 open_token: Ann::new(
202 current.range,
203 OpenToken::new(Cow::Borrowed(kind)).unwrap()
204 ),
205 children: Default::default(),
206 };
207 parse_tree.open_new_enclosure(new_stack);
208 }
209 Mode::EndEnclosure {kind: close_token} => {
210 parse_tree.close_last_enclosure(current);
211 }
212 Mode::Ident(ident) => {
213 let start = current.range.start;
214 let end = next
215 .map(|x| x.1.range.end)
216 .unwrap_or(current.range.end);
217 let new_node = Node::Ident(Ann::new(
218 CharRange::new(
219 start,
220 end,
221 ),
222 Atom::Borrowed(ident)
223 ));
224 parse_tree.add_child_node(new_node);
225 }
226 Mode::NoOP => {
227 let new_node = Node::String(Ann::new(
228 current.range,
229 Cow::Borrowed(current.word)
230 ));
231 parse_tree.add_child_node(new_node);
232 }
233 }
234 match consumed {
236 ZipperConsumed::Current => (),
237 ZipperConsumed::Right => {
238 assert!(next.is_some());
239 let offset = next.unwrap().0;
240 skip_to = Some(pos + offset);
241 }
242 }
243 }
244 parse_tree.finalize_all()
245 }
246}
247
248
249
250pub fn parse_source<'a>(source: &'a str) -> Vec<Node<'a>> {
257 let words = init_words(source, init_characters(source));
258 ParseTree::parse_words(words)
259}
260
261
262fn process_word<'a>(values: Vec<(CharIndex, &'a str)>) -> Vec<Vec<(CharIndex, &'a str)>> {
267 use itertools::Itertools;
268 values
269 .into_iter()
270 .group_by(|(_, char)| {
271 let char: &str = char;
272 match char {
273 "\\" => true,
274 "{" => true,
275 "}" => true,
276 "[" => true,
277 "]" => true,
278 "(" => true,
279 ")" => true,
280 "=" => true,
281 ">" => true,
282 "_" => true,
283 "^" => true,
284 _ => false
285 }
286 })
287 .into_iter()
288 .flat_map(|(key, group)| -> Vec<Vec<(CharIndex, &str)>> {
289 if key == true {
290 group
291 .into_iter()
292 .map(|(ix, ch)| {
293 vec![(ix, ch)]
294 })
295 .collect::<Vec<_>>()
296 } else {
297 vec![group.into_iter().collect::<Vec<_>>()]
298 }
299 })
300 .collect_vec()
301}
302
303
304
305#[derive(Debug, Clone)]
306pub struct Character<'a> {
307 range: CharRange,
308 char: &'a str,
309}
310
311impl<'a> Character<'a> {
312 pub fn is_whitespace(&self) -> bool {
313 self.char.chars().any(|x| x.is_whitespace())
314 }
315}
316
317pub fn init_characters<'a>(source: &'a str) -> Vec<Character<'a>> {
318 use itertools::Itertools;
319 let ending_byte_size = source.len();
320 let words = source
321 .grapheme_indices(true)
322 .enumerate()
323 .map(|(cix, (bix, x))| {
324 let index = CharIndex {
325 byte_index: bix,
326 char_index: cix,
327 };
328 (index, x)
329 })
330 .collect_vec();
331 let mut output = Vec::new();
332 for pos in 0..words.len() {
333 let (start, current) = words[pos];
334 let end = words
335 .get(pos + 1)
336 .map(|(pos, _)| *pos)
337 .unwrap_or_else(|| {
338 CharIndex {
339 byte_index: ending_byte_size,
340 char_index: pos + 1
341 }
342 });
343 output.push(Character{
344 range: CharRange{ start, end},
345 char: current
346 });
347 }
348 output
349}
350
351#[derive(Debug, Clone)]
352pub struct Word<'a> {
353 range: CharRange,
354 word: &'a str,
355}
356
357impl<'a> Word<'a> {
358 pub fn is_whitespace(&self) -> bool {
359 self.word.trim().is_empty()
360 }
361}
362
363pub fn init_words<'a>(source: &'a str, chars: Vec<Character<'a>>) -> Vec<Word<'a>> {
364 use itertools::Itertools;
365 let mut current_word_start = 0usize;
367 chars
368 .into_iter()
369 .group_by(|char| {
370 if char.is_whitespace() {
371 return true
372 }
373 match char.char {
374 "\\" => true,
375 "{" => true,
376 "}" => true,
377 "[" => true,
378 "]" => true,
379 "(" => true,
380 ")" => true,
381 "=" => true,
382 ">" => true,
383 "_" => true,
384 "." => true,
385 "^" => true,
386 _ => false
387 }
388 })
389 .into_iter()
390 .flat_map(|(key, chars)| {
391 let chars = chars.into_iter().collect_vec();
392 if key || chars.len() < 2 {
393 let chars = chars
394 .into_iter()
395 .map(|char| {
396 Word {
397 range: char.range,
398 word: char.char,
399 }
400 })
401 .collect_vec();
402 return chars;
403 }
404 let start = {
405 (&chars[0]).range.start
406 };
407 let end = {
408 (&chars[chars.len() - 1]).range.end
409 };
410 let word = &source[start.byte_index..end.byte_index];
411 let word = Word {
412 range: CharRange{start, end},
413 word,
414 };
415 vec![word]
416 })
417 .collect::<Vec<_>>()
418}
419
420fn match_word<'a>(current: &'a str, next: Option<&'a str>) -> (Mode<'a>, ZipperConsumed) {
421 match (current, next) {
422 ("\\", Some(next)) if next == "{" => (
423 Mode::Ident(INLINE_MATH_TAG),
424 ZipperConsumed::Current,
425 ),
426 ("\\", Some(ident)) if !is_token(ident) && ident != " " => (
427 Mode::Ident(ident),
428 ZipperConsumed::Right
429 ),
430 (tk @ "{", _) => (
431 Mode::BeginEnclosure{kind: tk},
432 ZipperConsumed::Current
433 ),
434 (tk @ "[", _) => (
435 Mode::BeginEnclosure{kind: tk},
436 ZipperConsumed::Current
437 ),
438 (tk @ "(", _) => (
439 Mode::BeginEnclosure{kind: tk},
440 ZipperConsumed::Current
441 ),
442 (tk @ "}", _) => (
443 Mode::EndEnclosure{kind: tk},
444 ZipperConsumed::Current
445 ),
446 (tk @ "]", _) => (
447 Mode::EndEnclosure{kind: tk},
448 ZipperConsumed::Current
449 ),
450 (tk @ ")", _) => (
451 Mode::EndEnclosure{kind: tk},
452 ZipperConsumed::Current
453 ),
454 _ => (Mode::NoOP, ZipperConsumed::Current),
455 }
456}
457
458
459
460