subscript_compiler/frontend/
parser.rs

//! The parser herein is designed to meet the following criteria:
//! * real-time parsing (suitable for IDE syntax highlighting),
//! * zero-copy parsing (only pointers are copied),
//! * fault-tolerant parsing, so it can also be used in IDEs and text editors.
//! Eventually I’d like to support incremental parsing as well.
use std::borrow::Cow;
use std::collections::{VecDeque, LinkedList};
use unicode_segmentation::UnicodeSegmentation;

use crate::frontend::data::*;
use crate::frontend::ast::*;


///////////////////////////////////////////////////////////////////////////////
// INTERNAL PARSER TYPES
///////////////////////////////////////////////////////////////////////////////

#[derive(Debug)]
struct Zipper<T> {
    left: Option<T>,
    current: T,
    right: Option<T>,
}

/// Whether a parse step consumed only the current word, or the current word
/// plus its right-hand (non-whitespace) neighbor.
enum ZipperConsumed {
    Current,
    Right,
}

#[derive(Debug, Clone)]
pub enum Mode<'a> {
    BeginEnclosure {
        kind: &'a str,
    },
    EndEnclosure {
        kind: &'a str,
    },
    Ident(&'a str),
    NoOP,
}

// type BeginEnclosureStack<'a> = VecDeque<(&'a str, CharIndex, LinkedList<Node<'a>>)>;

#[derive(Debug, Clone, PartialEq)]
pub enum OpenToken {
    CurlyBrace,
    SquareParen,
    Parens,
}

impl OpenToken {
    pub fn as_str(&self) -> &'static str {
        match self {
            OpenToken::CurlyBrace => "{",
            OpenToken::SquareParen => "[",
            OpenToken::Parens => "(",
        }
    }
    pub fn new<'a>(token: Atom<'a>) -> Option<OpenToken> {
        let token: &str = &token;
        match token {
            "{" => Some(OpenToken::CurlyBrace),
            "[" => Some(OpenToken::SquareParen),
            "(" => Some(OpenToken::Parens),
            _ => None,
        }
    }
}

#[derive(Debug, Clone)]
struct PartialBlock<'a> {
    open_token: Ann<OpenToken>,
    children: LinkedList<Node<'a>>,
}

#[derive(Debug, Clone)]
enum Branch<'a> {
    PartialBlock(PartialBlock<'a>),
    Node(Node<'a>),
}

/// The parser state: a stack of currently open enclosures (`scopes`) plus the
/// top-level nodes that have already been completed (`finalized`).
#[derive(Debug, Default)]
pub struct ParseTree<'a> {
    scopes: VecDeque<PartialBlock<'a>>,
    finalized: LinkedList<Node<'a>>,
}

///////////////////////////////////////////////////////////////////////////////
// PARSE-TREE UTILS
///////////////////////////////////////////////////////////////////////////////

impl<'a> ParseTree<'a> {
    fn add_child_node(&mut self, new_node: Node<'a>) {
        match self.scopes.back_mut() {
            Some(scope) => {
                scope.children.push_back(new_node);
            }
            None => {
                self.finalized.push_back(new_node);
            }
        }
    }
    fn open_new_enclosure(&mut self, new_enclosure: PartialBlock<'a>) {
        self.scopes.push_back(new_enclosure);
    }
    /// Close the innermost open enclosure; an unmatched close token is kept
    /// as `Node::InvalidToken` so parsing stays fault tolerant.
    fn close_last_enclosure(&mut self, close_word: &Word<'a>) {
        match self.scopes.pop_back() {
            Some(scope) => {
                let new_node = Enclosure {
                    kind: EnclosureKind::new(
                        Cow::Borrowed(scope.open_token.data.as_str()),
                        Cow::Borrowed(close_word.word),
                    ),
                    children: scope.children.into_iter().collect(),
                };
                let range = {
                    let start = scope.open_token.start();
                    let end = close_word.range.end;
                    CharRange::join(start, Some(end))
                };
                self.add_child_node(Node::Enclosure(Ann::join(range, new_node)));
            }
            None => {
                let new_node = Node::InvalidToken(Ann::new(
                    close_word.range,
                    Cow::Borrowed(close_word.word),
                ));
                self.add_child_node(new_node);
            }
        }
    }
    /// Flush any still-open enclosures as error enclosures and return the final node list.
    pub fn finalize_all(self) -> Vec<Node<'a>> {
        let ParseTree { scopes, mut finalized } = self;
        let xs = scopes
            .into_iter()
            .map(|scope| {
                let enclosure = Enclosure {
                    kind: EnclosureKind::Error {
                        open: Cow::Borrowed(scope.open_token.data.as_str()),
                        close: None,
                    },
                    children: scope.children.into_iter().collect(),
                };
                Node::Enclosure(Ann::join(
                    scope.open_token.range(),
                    enclosure,
                ))
            });
        finalized.extend(xs);
        finalized.into_iter().collect()
    }
}
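
// The following is an illustrative sketch, not part of the original module: it
// assumes the `CharRange`/`CharIndex` field layout used above and that
// `Node::InvalidToken` is the fault-tolerance path for unmatched close tokens.
#[cfg(test)]
mod parse_tree_sketch {
    use super::*;

    #[test]
    fn unmatched_close_token_becomes_invalid_token() {
        let mut tree = ParseTree::default();
        // A lone `}` with no open scope to close (positions are arbitrary).
        let close = Word {
            range: CharRange {
                start: CharIndex { byte_index: 0, char_index: 0 },
                end: CharIndex { byte_index: 1, char_index: 1 },
            },
            word: "}",
        };
        tree.close_last_enclosure(&close);
        let nodes = tree.finalize_all();
        assert!(matches!(nodes.as_slice(), [Node::InvalidToken(_)]));
    }
}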

///////////////////////////////////////////////////////////////////////////////
// CORE PARSER ENGINE
///////////////////////////////////////////////////////////////////////////////

impl<'a> ParseTree<'a> {
    pub fn parse_words(words: Vec<Word<'a>>) -> Vec<Node<'a>> {
        let mut parse_tree = ParseTree::default();
        let mut skip_to: Option<usize> = None;
        for pos in 0..words.len() {
            // Skip words that were already consumed as a right-hand neighbor.
            if let Some(start_from) = skip_to {
                if pos <= start_from {
                    continue;
                } else {
                    skip_to = None;
                }
            }
            let forward = |by: usize| {
                words
                    .get(pos + by)
                    .filter(|w| !w.is_whitespace())
                    .map(|w| (by, w))
            };
            let current = &words[pos];
            // Find the next non-whitespace word, together with its offset.
            let next = {
                let mut entry = None::<(usize, &Word)>;
                let words_left = words.len() - pos;
                for offset in 1..words_left {
                    assert!(entry.is_none());
                    entry = forward(offset);
                    if entry.is_some() {break}
                }
                entry
            };
            let (mode, consumed) = match_word(
                current.word,
                next.map(|(_, x)| x.word)
            );
            match mode {
                Mode::BeginEnclosure {kind} => {
                    let new_stack = PartialBlock {
                        open_token: Ann::new(
                            current.range,
                            // `match_word` only emits "{", "[" or "(" here, so this cannot fail.
                            OpenToken::new(Cow::Borrowed(kind)).unwrap()
                        ),
                        children: Default::default(),
                    };
                    parse_tree.open_new_enclosure(new_stack);
                }
                Mode::EndEnclosure {kind: _} => {
                    parse_tree.close_last_enclosure(current);
                }
                Mode::Ident(ident) => {
                    let start = current.range.start;
                    let end = next
                        .map(|x| x.1.range.end)
                        .unwrap_or(current.range.end);
                    let new_node = Node::Ident(Ann::new(
                        CharRange::new(
                            start,
                            end,
                        ),
                        Atom::Borrowed(ident)
                    ));
                    parse_tree.add_child_node(new_node);
                }
                Mode::NoOP => {
                    let new_node = Node::String(Ann::new(
                        current.range,
                        Cow::Borrowed(current.word)
                    ));
                    parse_tree.add_child_node(new_node);
                }
            }
            // Record how far ahead the zipper consumed, so those words get skipped.
            match consumed {
                ZipperConsumed::Current => (),
                ZipperConsumed::Right => {
                    assert!(next.is_some());
                    let offset = next.unwrap().0;
                    skip_to = Some(pos + offset);
                }
            }
        }
        parse_tree.finalize_all()
    }
}


///////////////////////////////////////////////////////////////////////////////
// PARSER ENTRYPOINT
///////////////////////////////////////////////////////////////////////////////


/// Main entrypoint: parse a source string into the parser AST.
pub fn parse_source<'a>(source: &'a str) -> Vec<Node<'a>> {
    let words = init_words(source, init_characters(source));
    ParseTree::parse_words(words)
}
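
// A minimal end-to-end usage sketch (not from the original file). It assumes
// `is_token` (from the data module) does not classify `h1` as a token, so that
// `\h1` parses as an identifier; the exact node shapes are not asserted here.
#[cfg(test)]
mod parse_source_sketch {
    use super::*;

    #[test]
    fn parses_balanced_and_unbalanced_sources() {
        // Balanced input: an identifier followed by a `{…}` enclosure.
        let nodes = parse_source(r"\h1{Hello}");
        assert!(!nodes.is_empty());

        // Fault tolerance: an unclosed brace still yields a tree (as an
        // error enclosure) rather than a panic.
        let nodes = parse_source("{unclosed");
        assert!(!nodes.is_empty());
    }
}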


///////////////////////////////////////////////////////////////////////////////
// DEV
///////////////////////////////////////////////////////////////////////////////

fn process_word<'a>(values: Vec<(CharIndex, &'a str)>) -> Vec<Vec<(CharIndex, &'a str)>> {
    use itertools::Itertools;
    values
        .into_iter()
        .group_by(|(_, char)| {
            let char: &str = char;
            match char {
                "\\" => true,
                "{" => true,
                "}" => true,
                "[" => true,
                "]" => true,
                "(" => true,
                ")" => true,
                "=" => true,
                ">" => true,
                "_" => true,
                "^" => true,
                _ => false
            }
        })
        .into_iter()
        .flat_map(|(key, group)| -> Vec<Vec<(CharIndex, &str)>> {
            if key {
                group
                    .into_iter()
                    .map(|(ix, ch)| {
                        vec![(ix, ch)]
                    })
                    .collect::<Vec<_>>()
            } else {
                vec![group.into_iter().collect::<Vec<_>>()]
            }
        })
        .collect_vec()
}


#[derive(Debug, Clone)]
pub struct Character<'a> {
    range: CharRange,
    char: &'a str,
}

impl<'a> Character<'a> {
    pub fn is_whitespace(&self) -> bool {
        self.char.chars().any(|x| x.is_whitespace())
    }
}

pub fn init_characters<'a>(source: &'a str) -> Vec<Character<'a>> {
    use itertools::Itertools;
    let ending_byte_size = source.len();
    let words = source
        .grapheme_indices(true)
        .enumerate()
        .map(|(cix, (bix, x))| {
            let index = CharIndex {
                byte_index: bix,
                char_index: cix,
            };
            (index, x)
        })
        .collect_vec();
    let mut output = Vec::new();
    for pos in 0..words.len() {
        let (start, current) = words[pos];
        let end = words
            .get(pos + 1)
            .map(|(next_start, _)| *next_start)
            .unwrap_or_else(|| {
                CharIndex {
                    byte_index: ending_byte_size,
                    char_index: pos + 1
                }
            });
        output.push(Character {
            range: CharRange { start, end },
            char: current
        });
    }
    output
}
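
// An illustrative sketch of the expected segmentation (an assumption about the
// `CharIndex` fields named above, not part of the original file): a multi-byte
// grapheme such as `é` occupies one `Character` but more than one byte.
#[cfg(test)]
mod init_characters_sketch {
    use super::*;

    #[test]
    fn segments_graphemes_with_byte_and_char_indices() {
        let chars = init_characters("aé b");
        // "a", "é", " ", "b": four grapheme clusters.
        assert_eq!(chars.len(), 4);
        // The accented grapheme spans more than one byte,
        assert!(chars[1].range.end.byte_index - chars[1].range.start.byte_index >= 2);
        // but it advances the grapheme index by exactly one.
        assert_eq!(chars[2].range.start.char_index, 2);
        assert!(chars[2].is_whitespace());
    }
}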

#[derive(Debug, Clone)]
pub struct Word<'a> {
    range: CharRange,
    word: &'a str,
}

impl<'a> Word<'a> {
    pub fn is_whitespace(&self) -> bool {
        self.word.trim().is_empty()
    }
}

pub fn init_words<'a>(source: &'a str, chars: Vec<Character<'a>>) -> Vec<Word<'a>> {
    use itertools::Itertools;
    chars
        .into_iter()
        .group_by(|char| {
            if char.is_whitespace() {
                return true
            }
            match char.char {
                "\\" => true,
                "{" => true,
                "}" => true,
                "[" => true,
                "]" => true,
                "(" => true,
                ")" => true,
                "=" => true,
                ">" => true,
                "_" => true,
                "." => true,
                "^" => true,
                _ => false
            }
        })
        .into_iter()
        .flat_map(|(key, chars)| {
            let chars = chars.into_iter().collect_vec();
            // Whitespace and token characters stay as single-character words.
            if key || chars.len() < 2 {
                let chars = chars
                    .into_iter()
                    .map(|char| {
                        Word {
                            range: char.range,
                            word: char.char,
                        }
                    })
                    .collect_vec();
                return chars;
            }
            // Otherwise merge the run into one word, borrowed directly from `source`.
            let start = chars[0].range.start;
            let end = chars[chars.len() - 1].range.end;
            let word = &source[start.byte_index..end.byte_index];
            let word = Word {
                range: CharRange { start, end },
                word,
            };
            vec![word]
        })
        .collect::<Vec<_>>()
}
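
// A short sketch of the word-splitting behaviour (not from the original file):
// token characters become single-character words, while runs of ordinary
// characters are merged into one word borrowed from `source`.
#[cfg(test)]
mod init_words_sketch {
    use super::*;

    #[test]
    fn splits_on_token_characters() {
        let source = r"\h1{Hello}";
        let words = init_words(source, init_characters(source));
        let texts: Vec<&str> = words.iter().map(|w| w.word).collect();
        assert_eq!(texts, vec!["\\", "h1", "{", "Hello", "}"]);
    }
}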

fn match_word<'a>(current: &'a str, next: Option<&'a str>) -> (Mode<'a>, ZipperConsumed) {
    match (current, next) {
        ("\\", Some(next)) if next == "{" => (
            Mode::Ident(INLINE_MATH_TAG),
            ZipperConsumed::Current,
        ),
        ("\\", Some(ident)) if !is_token(ident) && ident != " " => (
            Mode::Ident(ident),
            ZipperConsumed::Right
        ),
        (tk @ "{", _) => (
            Mode::BeginEnclosure{kind: tk},
            ZipperConsumed::Current
        ),
        (tk @ "[", _) => (
            Mode::BeginEnclosure{kind: tk},
            ZipperConsumed::Current
        ),
        (tk @ "(", _) => (
            Mode::BeginEnclosure{kind: tk},
            ZipperConsumed::Current
        ),
        (tk @ "}", _) => (
            Mode::EndEnclosure{kind: tk},
            ZipperConsumed::Current
        ),
        (tk @ "]", _) => (
            Mode::EndEnclosure{kind: tk},
            ZipperConsumed::Current
        ),
        (tk @ ")", _) => (
            Mode::EndEnclosure{kind: tk},
            ZipperConsumed::Current
        ),
        _ => (Mode::NoOP, ZipperConsumed::Current),
    }
}
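
// A small illustrative check (a sketch with assumptions, not from the original
// file): it assumes `is_token` does not treat `h1` as a token, so `\` followed
// by `h1` is classified as an identifier that also consumes the next word.
#[cfg(test)]
mod match_word_sketch {
    use super::*;

    #[test]
    fn classifies_backslash_idents_and_braces() {
        let (mode, consumed) = match_word("\\", Some("h1"));
        assert!(matches!(mode, Mode::Ident("h1")));
        assert!(matches!(consumed, ZipperConsumed::Right));

        let (mode, consumed) = match_word("{", None);
        assert!(matches!(mode, Mode::BeginEnclosure { kind: "{" }));
        assert!(matches!(consumed, ZipperConsumed::Current));
    }
}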