// petr_parse — parser.rs

1#[cfg(test)]
2mod tests;
3
4mod lexer;
5use std::rc::Rc;
6
7use lexer::Lexer;
8pub use lexer::Token;
9use miette::{Diagnostic, SourceSpan};
10use petr_ast::{Ast, Comment, ExprId, List, Module};
11use petr_utils::{IndexMap, SourceId, Span, SpannedItem, SymbolId, SymbolInterner};
12use thiserror::Error;
13
/// A parse failure paired with optional, human-readable help text.
///
/// The help text (when present) is produced from the parser's help-context
/// stack in `Parser::push_error` and surfaced through `Diagnostic::help`;
/// every other diagnostic query is forwarded to the inner [`ParseErrorKind`].
#[derive(Error, Debug, PartialEq)]
pub struct ParseError {
    // the underlying category of failure
    kind: ParseErrorKind,
    // optional multi-line help rendered for the user; `None` when no
    // contextual help was attached
    help: Option<String>,
}
19
20impl From<ParseErrorKind> for ParseError {
21    fn from(kind: ParseErrorKind) -> Self {
22        Self { kind, help: None }
23    }
24}
25
26impl ParseError {
27    pub fn with_help(
28        mut self,
29        help: Option<impl Into<String>>,
30    ) -> Self {
31        self.help = help.map(Into::into);
32        self
33    }
34}
35
// Delegation impl: every diagnostic query is forwarded to the inner
// `ParseErrorKind` (whose impl is derived via `#[derive(Diagnostic)]`),
// EXCEPT `help`, which is answered from the help text attached to this
// wrapper by `ParseError::with_help`.
impl Diagnostic for ParseError {
    // Answer from the locally attached help string, not the kind's
    // `#[help(...)]` attribute.
    fn help<'a>(&'a self) -> Option<Box<dyn std::fmt::Display + 'a>> {
        self.help.as_ref().map(|x| -> Box<dyn std::fmt::Display> { Box::new(x) })
    }

    fn code<'a>(&'a self) -> Option<Box<dyn std::fmt::Display + 'a>> {
        self.kind.code()
    }

    fn severity(&self) -> Option<miette::Severity> {
        self.kind.severity()
    }

    fn url<'a>(&'a self) -> Option<Box<dyn std::fmt::Display + 'a>> {
        self.kind.url()
    }

    fn source_code(&self) -> Option<&dyn miette::SourceCode> {
        self.kind.source_code()
    }

    fn labels(&self) -> Option<Box<dyn Iterator<Item = miette::LabeledSpan> + '_>> {
        self.kind.labels()
    }

    fn related<'a>(&'a self) -> Option<Box<dyn Iterator<Item = &'a dyn Diagnostic> + 'a>> {
        self.kind.related()
    }

    fn diagnostic_source(&self) -> Option<&dyn Diagnostic> {
        self.kind.diagnostic_source()
    }
}
69
70impl std::fmt::Display for ParseError {
71    fn fmt(
72        &self,
73        f: &mut std::fmt::Formatter<'_>,
74    ) -> std::fmt::Result {
75        write!(f, "{}", self.kind)
76    }
77}
78
/// The specific category of parse failure. The `#[error(...)]` attributes
/// supply the user-facing message for each variant; `#[derive(Diagnostic)]`
/// supplies the miette metadata that `ParseError` delegates to.
#[derive(Error, Debug, Diagnostic, PartialEq)]
pub enum ParseErrorKind {
    #[error("Unmatched parenthesis")]
    UnmatchedParenthesis,
    /// Payload is a description of what was actually found.
    #[error("Expected identifier, found {0}")]
    ExpectedIdentifier(String),
    /// Payload is (expected token, actual token).
    #[error("Expected token {0}, found {1}")]
    ExpectedToken(Token, Token),
    /// Payload is (acceptable tokens, actual token); the list is rendered
    /// by `format_toks` below.
    #[error("Expected one of tokens {}; found {1}", format_toks(.0))]
    ExpectedOneOf(Vec<Token>, Token),
    #[error("Internal error in parser. Please file an issue on GitHub. {0}")]
    InternalError(String),
    #[error("File name could not be converted to module name. petr source names should be a valid identifier.")]
    #[help(
        "Identifiers cannot begin with numbers, contain spaces, or contain symbols other than an underscore or hyphen.\
     Hyphens in file names are transformed into underscores."
    )]
    InvalidIdentifier(String),
    /// Two labeled spans (the joined span and the originating span) from
    /// different files; see `impl Parse for SpannedItem<T>` where this is raised.
    #[error("Internal error in parser. Please file an issue on GitHub. A span was joined with a span from another file.")]
    InternalSpanError(#[label] SourceSpan, #[label] SourceSpan),
}
100
101impl ParseErrorKind {
102    pub fn into_err(self) -> ParseError {
103        self.into()
104    }
105}
106
/// Formats a token list for error messages, e.g. `"a, b, or c"` (3+),
/// `"a or b"` (2), or just `"a"` (1). Returns an empty string for an
/// empty slice.
///
/// Generic over any `Display` type so it can be unit-tested independently
/// of `Token`; callers in this module pass `&[Token]` (or `&Vec<Token>`
/// via deref coercion from the `#[error]` attribute).
fn format_toks<T: std::fmt::Display>(toks: &[T]) -> String {
    match toks {
        // previously `toks.len() - 1` underflowed and panicked here
        [] => String::new(),
        // previously a single token rendered as an empty string
        [only] => only.to_string(),
        [first, last] => format!("{first} or {last}"),
        [init @ .., last] => {
            // Oxford-comma join for three or more tokens.
            let mut buf = init.iter().map(|t| t.to_string()).collect::<Vec<_>>().join(", ");
            buf.push_str(&format!(", or {last}"));
            buf
        },
    }
}
120
/// Module-local alias: a parse result whose error type is [`ParseError`].
/// NOTE(review): not referenced anywhere in this visible chunk — confirm it
/// is used elsewhere in the file before removing.
type Result<T> = std::result::Result<T, ParseError>;
122
/// Stateful recursive-descent parser over one or more source files.
///
/// Owns the lexer, a one-token lookahead buffer, and all state accumulated
/// while parsing (errors, comments, interned symbols, source map).
pub struct Parser {
    // interned symbols; reusable across compilations (see
    // `new_with_existing_interner_and_source_map`)
    interner: SymbolInterner,
    /// some exprs need to be assigned an ID, because they generate a scope
    /// which is stored in the binder and needs to be retrieved later
    expr_id_assigner: usize,
    lexer: Lexer,
    // errors accumulated so far; returned from `into_result`
    errors: Vec<SpannedItem<ParseError>>,
    // comments captured as `advance` skips over them; drained via `drain_comments`
    comments: Vec<SpannedItem<Comment>>,
    // one-token lookahead buffer; `None` means nothing is buffered
    peek: Option<SpannedItem<Token>>,
    // the tuple is the file name and content
    source_map: IndexMap<SourceId, (&'static str, &'static str)>,
    // stack of contextual help strings pushed/popped by `with_help`;
    // rendered into error help text by `push_error`
    help: Vec<String>,
}
136
impl Parser {
    /// Records a parse error. If any help context is on the stack, it is
    /// rendered into an indented, arrow-prefixed trail and attached to the
    /// error; otherwise the bare error is pushed.
    pub fn push_error(
        &mut self,
        err: SpannedItem<ParseErrorKind>,
    ) {
        if self.help.is_empty() {
            return self.errors.push(err.map(|err| err.into_err()));
        }
        // Render the help stack top-down: outer frames read "while parsing …",
        // the innermost frame reads "expected …", each line indented one level
        // deeper than its parent and prefixed with "↪ " (except the first).
        let mut help_text = Vec::with_capacity(self.help.len());
        for (indentation, help) in self.help.iter().enumerate() {
            let is_last = indentation == self.help.len() - 1;
            let text = format!(
                "{}{}{}{help}",
                "  ".repeat(indentation),
                if indentation == 0 { "" } else { "↪ " },
                if is_last { "expected " } else { "while parsing " }
            );
            help_text.push(text);
        }
        let err = err.map(|err| err.into_err().with_help(Some(help_text.join("\n"))));
        self.errors.push(err);
    }

    /// The source text backing the lexer's current position.
    pub fn slice(&self) -> &str {
        self.lexer.slice()
    }

    /// Interns a string and returns its symbol ID.
    pub fn intern(
        &mut self,
        internee: Rc<str>,
    ) -> SymbolId {
        self.interner.insert(internee)
    }

    /// The lexer's current span.
    pub fn span(&self) -> Span {
        self.lexer.span()
    }

    /// Returns the next significant token without consuming it. The token is
    /// buffered in `self.peek` so the following `advance` yields it again.
    pub fn peek(&mut self) -> SpannedItem<Token> {
        if let Some(ref peek) = self.peek {
            *peek
        } else {
            let item = self.advance();
            self.peek = Some(item);
            item
        }
    }

    /// Builds a parser that shares the symbol interner and source map of a
    /// previous compilation, appending `sources` to the shared map. The
    /// lexer's source IDs are offset past the entries already in `source_map`.
    pub fn new_with_existing_interner_and_source_map(
        sources: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
        interner: SymbolInterner,
        mut source_map: IndexMap<SourceId, (&'static str, &'static str)>,
    ) -> Self {
        // TODO we hold two copies of the source for now: one in source_maps, and one outside the parser
        // for the lexer to hold on to and not have to do self-referential pointers.
        // NOTE(review): `Box::leak` gives the name and content `'static`
        // lifetimes by leaking them for the rest of the process — presumably
        // intentional for a compiler driver; confirm if parsers are created
        // repeatedly in a long-lived process.
        let sources = sources
            .into_iter()
            .map(|(name, source)| -> (&'static str, &'static str) {
                let name = name.into();
                let source = source.into();
                (Box::leak(name.into_boxed_str()), Box::leak(source.into_boxed_str()))
            })
            .collect::<Vec<_>>();
        let sources_for_lexer = sources.iter().map(|(_, source)| *source);

        let lexer = Lexer::new_with_offset_into_sources(sources_for_lexer, source_map.len());

        for (name, source) in sources.into_iter() {
            source_map.insert((name, source));
        }

        Self {
            // reuse the interner if provided, otherwise create a new one
            interner,
            lexer,
            errors: Default::default(),
            comments: Default::default(),
            peek: None,
            source_map,
            help: Default::default(),
            expr_id_assigner: 0,
        }
    }

    /// Allocates the next sequential expression ID (used for exprs that
    /// generate a scope; see the field comment on `expr_id_assigner`).
    pub fn new_expr_id(&mut self) -> ExprId {
        let id = self.expr_id_assigner;
        self.expr_id_assigner += 1;
        ExprId(id)
    }

    /// Builds a parser with a fresh interner and an empty source map.
    pub fn new(sources: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>) -> Self {
        Self::new_with_existing_interner_and_source_map(sources, Default::default(), Default::default())
    }

    /// Removes and returns every comment collected so far, discarding spans.
    pub fn drain_comments(&mut self) -> Vec<Comment> {
        self.comments.drain(..).map(|spanned_item| spanned_item.into_item()).collect()
    }

    /// consume tokens until a node is produced
    ///
    /// Parses every module, then consumes the parser, returning the AST
    /// together with the accumulated errors, the interner, and the source map.
    #[allow(clippy::type_complexity)]
    pub fn into_result(
        mut self
    ) -> (
        Ast,
        Vec<SpannedItem<ParseError>>,
        SymbolInterner,
        IndexMap<SourceId, (&'static str, &'static str)>,
    ) {
        let nodes: Vec<Module> = self.many::<Module>();
        // drop the lexers from the source map
        (Ast::new(nodes), self.errors, self.interner, self.source_map)
    }

    /// Read-only access to the symbol interner.
    pub fn interner(&self) -> &SymbolInterner {
        &self.interner
    }

    /// Parses zero or more `P`s, stopping at EOF or at the first item that
    /// fails to parse. Items that failed may already have pushed errors.
    pub fn many<P: Parse>(&mut self) -> Vec<P> {
        let mut buf = Vec::new();
        loop {
            if *self.peek().item() == Token::Eof {
                break;
            }
            if let Some(parsed_item) = P::parse(self) {
                buf.push(parsed_item);
            } else {
                break;
            }
        }
        buf
    }

    /// parses a sequence separated by `separator`
    /// e.g. if separator is `Token::Comma`, can parse `a, b, c, d`
    /// NOTE: this parses zero or more items. Will not reject zero items.
    pub fn sequence_zero_or_more<P: Parse>(
        &mut self,
        separator: Token,
    ) -> Option<Vec<P>> {
        self.with_help(format!("while parsing {separator} separated sequence"), |p| {
            let mut buf = vec![];
            loop {
                // if there are no items in the buf, we try a backtracking parse in case this is
                // a zero sequence
                let item = if buf.is_empty() {
                    let res = p.with_backtrack(|p| P::parse(p));
                    match res {
                        Ok(item) => Some(item),
                        Err(_) => {
                            // return the zero sequence
                            break;
                        },
                    }
                } else {
                    // there is at least one item in the buf, so we know all parse errors arising
                    // from `P` are valid
                    P::parse(p)
                };
                match item {
                    Some(item) => buf.push(item),
                    None => {
                        break;
                    },
                }
                // continue only while the separator follows the item just parsed
                if *p.peek().item() == separator {
                    p.advance();
                } else {
                    break;
                }
            }

            Some(buf)
        })
    }

    /// parses a sequence separated by `separator`
    /// e.g. if separator is `Token::Comma`, can parse `a, b, c, d`
    /// NOTE: this parses one or more items. Will reject zero items.
    /// alias for `sequence`
    pub fn sequence_one_or_more<P: Parse>(
        &mut self,
        separator: Token,
    ) -> Option<Vec<P>> {
        self.sequence(separator)
    }

    /// parses a sequence separated by `separator`
    /// e.g. if separator is `Token::Comma`, can parse `a, b, c, d`
    /// NOTE: this parses one or more items. Will reject zero items.
    pub fn sequence<P: Parse>(
        &mut self,
        separator: Token,
    ) -> Option<Vec<P>> {
        let mut buf = vec![];
        // Each item is parsed with backtracking; the errors from the first
        // failing parse are captured in `errs` and only committed below when
        // no items were parsed at all (i.e. the whole sequence failed).
        let errs = loop {
            let item = self.with_backtrack(|p| P::parse(p));
            match item {
                Ok(item) => buf.push(item),
                Err(errs) => {
                    break errs;
                },
            }
            if *self.peek().item() == separator {
                self.advance();
            } else {
                // no trailing separator: the sequence ended cleanly
                break vec![];
            }
        };
        if buf.is_empty() {
            for err in errs {
                self.errors.push(err)
            }
            None
        } else {
            Some(buf)
        }
    }

    /// Consumes and returns the next significant token, draining the peek
    /// buffer first. Newlines are skipped; comments are parsed, stored in
    /// `self.comments`, and also skipped (recursively).
    pub fn advance(&mut self) -> SpannedItem<Token> {
        if let Some(tok) = self.peek.take() {
            return tok;
        }
        let next_tok = self.lexer.advance();
        match *next_tok.item() {
            Token::Newline => self.advance(),
            Token::Comment => {
                if let Some(comment) = self.parse::<SpannedItem<Comment>>() {
                    self.comments.push(comment);
                }
                self.advance()
            },
            _ => next_tok,
        }
    }

    /// doesn't push the error to the error list and doesn't advance if the token is not found
    pub fn try_token(
        &mut self,
        tok: Token,
    ) -> Option<SpannedItem<Token>> {
        let peeked_token = self.peek();
        if *peeked_token.item() == tok {
            Some(self.advance())
        } else {
            None
        }
    }

    /// Consumes `tok` or pushes an `ExpectedToken` error (with help context)
    /// and returns `None` without advancing.
    pub fn token(
        &mut self,
        tok: Token,
    ) -> Option<SpannedItem<Token>> {
        self.with_help(format!("while parsing token {tok}"), |p| {
            let peeked_token = p.peek();
            if *peeked_token.item() == tok {
                Some(p.advance())
            } else {
                let span = p.lexer.span();
                p.push_error(span.with_item(ParseErrorKind::ExpectedToken(tok, *peeked_token.item())));
                None
            }
        })
    }

    /// Convenience wrapper: parse a `P` starting at the current position.
    pub fn parse<P: Parse>(&mut self) -> Option<P> {
        P::parse(self)
    }

    /// Consumes the next token if it is any of `toks`; otherwise pushes an
    /// error and returns `None`. A single-candidate set reports the simpler
    /// `ExpectedToken` instead of `ExpectedOneOf`.
    pub fn one_of<const N: usize>(
        &mut self,
        toks: [Token; N],
    ) -> Option<SpannedItem<Token>> {
        match self.peek().item() {
            tok if toks.contains(tok) => self.token(*tok),
            tok => {
                let span = self.lexer.span();
                if N == 1 {
                    self.push_error(span.with_item(ParseErrorKind::ExpectedToken(toks[0], *tok)));
                } else {
                    self.push_error(span.with_item(ParseErrorKind::ExpectedOneOf(toks.to_vec(), *tok)));
                }
                None
            },
        }
    }

    /// Read-only view of the errors collected so far.
    pub fn errors(&self) -> &[SpannedItem<ParseError>] {
        &self.errors
    }

    /// Runs `f` with `help_text` pushed onto the help-context stack; the
    /// entry is popped again afterwards, so errors raised inside `f` carry
    /// this context (see `push_error`).
    pub fn with_help<F, T>(
        &mut self,
        help_text: impl Into<String>,
        f: F,
    ) -> T
    where
        F: Fn(&mut Parser) -> T,
    {
        self.push_help(help_text);
        let res = f(self);
        self.pop_help();
        res
    }

    fn push_help(
        &mut self,
        arg: impl Into<String>,
    ) {
        self.help.push(arg.into())
    }

    fn pop_help(&mut self) {
        let _ = self.help.pop();
    }

    /// Performs a backtracking parse, which means that if the inner function returns `None`,
    /// the parser will backtrack to the state before the function was called and revert any
    /// errors that were encountered, returning them as `Err` but crucially not appending them to
    /// `self.errors`.
    /// Note that this is NOT a performant method, and it should be used sparingly.
    pub fn with_backtrack<F, T>(
        &mut self,
        f: F,
    ) -> std::result::Result<T, Vec<SpannedItem<ParseError>>>
    where
        F: Fn(&mut Parser) -> Option<T>,
    {
        let checkpoint = self.checkpoint();
        let res = f(self);
        match res {
            Some(res) => Ok(res),
            None => Err(self.restore_checkpoint(checkpoint)),
        }
    }

    /// Snapshots the state that `restore_checkpoint` rewinds: the error
    /// count, the lexer (cloned), and the peek buffer.
    /// NOTE(review): `comments` and the `help` stack are NOT part of the
    /// checkpoint, so comments collected inside a backtracked region are
    /// kept — confirm this is intentional.
    fn checkpoint(&self) -> Checkpoint {
        Checkpoint {
            errors: self.errors.len(),
            lexer:  self.lexer.clone(),
            peek:   self.peek,
        }
    }

    /// Rewinds to `checkpoint` and returns the errors recorded since it was
    /// taken (removing them from `self.errors`).
    fn restore_checkpoint(
        &mut self,
        checkpoint: Checkpoint,
    ) -> Vec<SpannedItem<ParseError>> {
        self.lexer = checkpoint.lexer;
        self.peek = checkpoint.peek;
        self.errors.split_off(checkpoint.errors)
    }

    /// Read-only view of the (file name, file content) source map.
    pub fn source_map(&self) -> &IndexMap<SourceId, (&'static str, &'static str)> {
        &self.source_map
    }
}
493
/// Snapshot of parser state taken by `Parser::checkpoint` and rewound by
/// `Parser::restore_checkpoint` during backtracking parses.
struct Checkpoint {
    // number of errors at snapshot time; later errors are split off on restore
    errors: usize,
    // full clone of the lexer, restoring its position exactly
    lexer:  Lexer,
    // the one-token lookahead buffer as it was at snapshot time
    peek:   Option<SpannedItem<Token>>,
}
499
/// A type that can be parsed from the parser's token stream.
///
/// Returns `None` on failure; implementations may record the failure on the
/// parser's error list themselves (e.g. via `Parser::token`/`push_error`).
pub trait Parse: Sized {
    fn parse(p: &mut Parser) -> Option<Self>;
}
503
// Blanket impl: any parseable `T` can be parsed with a span attached, by
// measuring the lexer span before and after the inner parse.
impl<T> Parse for SpannedItem<T>
where
    T: Parse,
{
    fn parse(p: &mut Parser) -> Option<Self> {
        let before_span = p.lexer.span();
        let result = T::parse(p)?;

        let mut after_span = p.lexer.span();
        // If the parse ran off the end of one file into the start of the
        // next, clamp the span to the end of the file it started in.
        if after_span.source() != before_span.source() && after_span.span().offset() == 0 {
            // this is acceptable only if after_span.pos == 0, 0 OR if after_span.pos <= the first
            // whitespace of the next file, which is less likely to be hit and this function
            // doesn't account for (yet).
            //
            // Sometimes, the span actually goes to the last character of the previous file
            // this solves the case where the parse consumes all white space until the next token,
            // which will be in the next file
            //
            // TODO: it would be better if we were smarter about spans, and ideally
            // a span going into the next file wouldn't be generated in this scenario.
            // assignee: sezna
            after_span = before_span.extend(p.source_map.get(before_span.source()).1.len());
        } else if after_span.source() != before_span.source() {
            // A genuine cross-file span join is an internal invariant
            // violation — report it rather than producing a bogus span.
            let span = before_span.with_item(ParseErrorKind::InternalSpanError(after_span.span(), before_span.span()));
            p.push_error(span);
            return None;
        }

        // i think this should be `hi` to `hi`, not 100% though
        Some(before_span.hi_to_hi(after_span).with_item(result))
    }
}
536
537impl Parse for List {
538    fn parse(p: &mut Parser) -> Option<Self> {
539        p.try_token(Token::OpenBracket)?;
540        let elements = p.sequence(Token::Comma)?;
541        p.token(Token::CloseBracket)?;
542        Some(List {
543            elements: elements.into_boxed_slice(),
544        })
545    }
546}