ketos/parser.rs

//! Parses a series of `lexer` tokens into a code expression.

use std::borrow::Cow::{self, Borrowed, Owned};
use std::collections::HashMap;
use std::fmt;
use std::path::PathBuf;

use crate::bytes::Bytes;
use crate::error::Error;
use crate::exec::Context;
use crate::integer::{Integer, Ratio};
use crate::lexer::{Lexer, Span, Token};
use crate::name::{get_standard_name_for, standard_names, Name, NameDisplay, NameStore};
use crate::restrict::RestrictError;
use crate::string;
use crate::value::Value;

const MODULE_DOC_COMMENT: &str = ";;;";

/// Parses a stream of tokens into an expression.
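///
/// A minimal usage sketch (mirroring the tests at the bottom of this module;
/// `Interpreter` and `Lexer` are this crate's types):
///
/// ```ignore
/// let interp = Interpreter::new();
///
/// // Build a parser over a lexer and parse one complete expression.
/// let mut parser = Parser::new(interp.context(), Lexer::new("(foo 1 2 3)", 0));
/// let expr = parser.parse_single_expr().unwrap();
/// ```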
pub struct Parser<'a, 'lex> {
    lexer: Lexer<'lex>,
    ctx: &'a Context,
    name_cache: HashMap<&'lex str, Name>,
    cur_token: Option<(Span, Token<'lex>)>,
}

/// Represents an error in parsing input.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct ParseError {
    /// Span of source code which caused the error
    pub span: Span,
    /// Kind of error generated
    pub kind: ParseErrorKind,
}

impl ParseError {
    /// Creates a new `ParseError`.
    pub fn new(span: Span, kind: ParseErrorKind) -> ParseError {
        ParseError{ span, kind }
    }
}

impl fmt::Display for ParseError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Display::fmt(&self.kind, f)
    }
}

impl NameDisplay for ParseError {
    fn fmt(&self, _names: &NameStore, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Display::fmt(self, f)
    }
}

/// Describes the kind of error encountered in parsing.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ParseErrorKind {
    /// Doc comment followed by incompatible token
    CannotDocumentItem,
    /// Doc comment at end-of-file
    DocCommentEof,
    /// Error in parsing literal
    InvalidLiteral,
    /// Error in parsing token
    InvalidToken,
    /// Invalid character in byte or byte string literal
    InvalidByte(char),
    /// Invalid escape sequence in byte or byte string literal
    InvalidByteEscape(char),
    /// Invalid character in input
    InvalidChar(char),
    /// Invalid character in numeric escape sequence `\xNN` or `\u{NNNN}`
    InvalidNumericEscape(char),
    /// Error parsing literal string into value
    LiteralParseError,
    /// Missing closing parenthesis
    MissingCloseParen,
    /// More commas than backquotes
    UnbalancedComma,
    /// Unexpected end-of-file
    UnexpectedEof,
    /// Unexpected token
    UnexpectedToken{
        /// Token or category of token expected
        expected: &'static str,
        /// Token found
        found: &'static str,
    },
    /// Unrecognized character escape
    UnknownCharEscape(char),
    /// Unmatched `)`
    UnmatchedParen,
    /// Unterminated character constant
    UnterminatedChar,
    /// Unterminated block comment
    UnterminatedComment,
    /// Unterminated string constant
    UnterminatedString,
}

impl fmt::Display for ParseErrorKind {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            ParseErrorKind::CannotDocumentItem =>
                f.write_str("doc comment precedes item that cannot be documented"),
            ParseErrorKind::DocCommentEof => f.write_str("doc comment at end-of-file"),
            ParseErrorKind::InvalidLiteral => f.write_str("invalid numeric literal"),
            ParseErrorKind::InvalidToken => f.write_str("invalid token"),
            ParseErrorKind::InvalidByte(ch) =>
                write!(f, "byte literal must be ASCII: {:?}", ch),
            ParseErrorKind::InvalidByteEscape(ch) =>
                write!(f, "invalid escape sequence in byte literal: {:?}", ch),
            ParseErrorKind::InvalidChar(ch) =>
                write!(f, "invalid character: {:?}", ch),
            ParseErrorKind::InvalidNumericEscape(ch) =>
                write!(f, "invalid character in escape sequence: {:?}", ch),
            ParseErrorKind::LiteralParseError => f.write_str("literal parse error"),
            ParseErrorKind::MissingCloseParen => f.write_str("missing close paren"),
            ParseErrorKind::UnbalancedComma => f.write_str("unbalanced ` and ,"),
            ParseErrorKind::UnexpectedEof => f.write_str("unexpected end-of-file"),
            ParseErrorKind::UnexpectedToken{expected, found} =>
                write!(f, "expected {}; found {}", expected, found),
            ParseErrorKind::UnknownCharEscape(ch) =>
                write!(f, "unknown char escape: {:?}", ch),
            ParseErrorKind::UnmatchedParen => f.write_str("unmatched `)`"),
            ParseErrorKind::UnterminatedChar => f.write_str("unterminated char constant"),
            ParseErrorKind::UnterminatedComment => f.write_str("unterminated block comment"),
            ParseErrorKind::UnterminatedString => f.write_str("unterminated string constant"),
        }
    }
}

enum Group<'lex> {
    /// Positive indicates a number of backticks,
    /// negative indicates a number of commas.
    Backticks(i32),
    CommaAt,
    /// Number of quotes preceding group.
    /// If zero, this is an unquoted parentheses group.
    Quotes(u32),
    /// Values in a parenthetical expression
    Parens(Vec<Value>, Option<(Span, &'lex str)>),
}

impl<'a, 'lex> Parser<'a, 'lex> {
    /// Creates a new `Parser` using the given `Lexer`.
    /// Identifiers received from the lexer will be inserted into the given context.
    pub fn new(ctx: &'a Context, lexer: Lexer<'lex>) -> Parser<'a, 'lex> {
        Parser{
            lexer,
            ctx,
            name_cache: HashMap::new(),
            cur_token: None,
        }
    }

    /// Skips the "shebang" line of a source file.
    pub fn skip_shebang(&mut self) {
        self.lexer.skip_shebang();
    }

    /// Parses an expression from the input stream.
    pub fn parse_expr(&mut self) -> Result<Value, Error> {
        let mut stack = Vec::new();
        let mut total_backticks = 0;

        loop {
            if stack.len() >= self.ctx.restrict().max_syntax_nesting {
                return Err(From::from(RestrictError::MaxSyntaxNestingExceeded));
            }

            let mut doc = self.read_doc_comment()?;

            let (sp, tok) = self.next()?;

            let r = match tok {
                Token::DocComment(_) =>
                    return Err(From::from(ParseError::new(
                        doc.unwrap().0, ParseErrorKind::CannotDocumentItem))),
                Token::LeftParen => {
                    if let Some((doc_sp, _)) = doc {
                        match self.peek()? {
                            (_, Token::Name("const")) |
                            (_, Token::Name("define")) |
                            (_, Token::Name("lambda")) |
                            (_, Token::Name("macro")) |
                            (_, Token::Name("struct"))
                                => (),
                            _ => return Err(From::from(ParseError::new(
                                doc_sp, ParseErrorKind::CannotDocumentItem)))
                        }
                    }

                    stack.push(Group::Parens(Vec::new(), doc.take()));
                    continue;
                }
                Token::RightParen => {
                    let group = stack.pop().ok_or_else(
                        || ParseError::new(sp, ParseErrorKind::UnmatchedParen))?;

                    match group {
                        Group::Parens(values, doc) =>
                            insert_doc_comment(values, doc)
                                .map_err(From::from),
                        _ => Err(From::from(ParseError::new(sp,
                            ParseErrorKind::UnexpectedToken{
                                expected: "expression",
                                found: ")",
                            })))
                    }
                }
                Token::Float(f) => parse_float(f)
                    .map(Value::Float)
                    .map_err(|kind| From::from(ParseError::new(sp, kind))),
                Token::Integer(i, base) => parse_integer(self.ctx, i, base, sp)
                    .map(Value::Integer),
                Token::Ratio(r) => parse_ratio(self.ctx, r, sp)
                    .map(Value::Ratio),
                Token::Char(ch) => parse_char(ch)
                    .map(Value::Char).map_err(From::from),
                Token::String(s) => parse_string(s)
                    .map(|s| s.into()).map_err(From::from),
                Token::Byte(b) => parse_byte(b)
                    .map(|v| v.into()).map_err(From::from),
                Token::Bytes(b) => parse_bytes(b)
                    .map(Value::Bytes).map_err(From::from),
                Token::Path(p) => parse_path(p)
                    .map(|p| p.into()).map_err(From::from),
                Token::Name(name) => Ok(self.name_value(name)),
                Token::Keyword(name) => Ok(Value::Keyword(self.add_name(name))),
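                // Note on quasiquote bookkeeping (explanatory comment, not from
                // the original source): `total_backticks` counts how many
                // enclosing backquotes are currently open, so a comma is only
                // accepted inside at least one backquote. For example, in the
                // input  `(foo ,bar)  the backquote raises the count to 1 and
                // the comma consumes it; a bare  ,bar  with no enclosing
                // backquote is rejected with `UnbalancedComma` (see the tests
                // at the bottom of this module).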
                Token::BackQuote => {
                    total_backticks += 1;
                    if let Some(&mut Group::Backticks(ref mut n)) = stack.last_mut() {
                        *n += 1;
                        continue;
                    }
                    stack.push(Group::Backticks(1));
                    continue;
                }
                Token::Comma => {
                    if total_backticks <= 0 {
                        return Err(From::from(ParseError::new(sp, ParseErrorKind::UnbalancedComma)));
                    }
                    total_backticks -= 1;
                    if let Some(&mut Group::Backticks(ref mut n)) = stack.last_mut() {
                        *n -= 1;
                        continue;
                    }
                    stack.push(Group::Backticks(-1));
                    continue;
                }
                Token::CommaAt => {
                    if total_backticks <= 0 {
                        return Err(From::from(ParseError::new(sp, ParseErrorKind::UnbalancedComma)));
                    }
                    total_backticks -= 1;
                    stack.push(Group::CommaAt);
                    continue;
                }
                Token::Quote => {
                    if let Some(&mut Group::Quotes(ref mut n)) = stack.last_mut() {
                        *n += 1;
                        continue;
                    }
                    stack.push(Group::Quotes(1));
                    continue;
                }
                Token::End => {
                    if let Some((doc_sp, _)) = doc {
                        return Err(From::from(ParseError::new(doc_sp,
                            ParseErrorKind::DocCommentEof)));
                    }

                    let any_paren = stack.iter().any(|group| {
                        match *group {
                            Group::Parens(_, _) => true,
                            _ => false
                        }
                    });

                    if any_paren {
                        Err(From::from(ParseError::new(sp,
                            ParseErrorKind::MissingCloseParen)))
                    } else {
                        Err(From::from(ParseError::new(sp,
                            ParseErrorKind::UnexpectedEof)))
                    }
                }
            };

            if let Some((doc_sp, _)) = doc {
                return Err(From::from(ParseError::new(doc_sp,
                    ParseErrorKind::CannotDocumentItem)));
            }

            let mut v = r?;

            loop {
                match stack.last_mut() {
                    None => return Ok(v),
                    Some(&mut Group::Parens(ref mut values, _)) => {
                        values.push(v);
                        break;
                    }
                    _ => ()
                }

                let group = stack.pop().unwrap();

                match group {
                    // 0 backticks is ignored, but must still be considered
                    // a group because an expression must follow.
                    Group::Backticks(n) if n > 0 => {
                        total_backticks -= n;
                        v = v.quasiquote(n as u32);
                    }
                    Group::Backticks(n) if n < 0 => {
                        total_backticks -= n; // Subtracting a negative
                        v = v.comma((-n) as u32);
                    }
                    Group::CommaAt => {
                        total_backticks += 1;
                        v = v.comma_at(1);
                    }
                    Group::Quotes(n) => v = v.quote(n),
                    _ => ()
                }
            }
        }
    }

    /// Parses a single expression from the input stream.
    /// If any tokens remain after the expression, an error is returned.
    pub fn parse_single_expr(&mut self) -> Result<Value, Error> {
        let expr = self.parse_expr()?;

        match self.next()? {
            (_, Token::End) => Ok(expr),
            (sp, tok) => Err(From::from(ParseError::new(sp, ParseErrorKind::UnexpectedToken{
                expected: "eof",
                found: tok.name(),
            })))
        }
    }

    /// Parses a series of expressions from the input stream.
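    ///
    /// If the stream begins with a module-level doc comment (one starting with
    /// `;;;`), the returned list is prefixed with an expression that passes the
    /// formatted comment text to `standard_names::SET_MODULE_DOC`.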
    pub fn parse_exprs(&mut self) -> Result<Vec<Value>, Error> {
        let mut res = Vec::new();

        if let Some((_, doc)) = self.read_module_doc_comment()? {
            res.push(vec![
                Value::Name(standard_names::SET_MODULE_DOC),
                format_doc_comment(doc).into(),
            ].into());
        }

        loop {
            match self.peek()? {
                (_sp, Token::End) => break,
                _ => res.push(self.parse_expr()?)
            }
        }

        Ok(res)
    }

    /// Returns a borrowed reference to the contained `Lexer`.
    pub fn lexer(&self) -> &Lexer<'lex> {
        &self.lexer
    }

    /// Returns the next token if it is a doc comment.
    /// Otherwise, `None` is returned and no token is consumed.
    fn read_doc_comment(&mut self)
            -> Result<Option<(Span, &'lex str)>, ParseError> {
        match self.peek()? {
            (sp, Token::DocComment(doc)) => {
                self.cur_token.take();
                Ok(Some((sp, doc)))
            }
            _ => Ok(None)
        }
    }

    /// Returns the content of the next token if it is a module-level doc
    /// comment, one beginning with at least three semicolon characters.
    fn read_module_doc_comment(&mut self)
            -> Result<Option<(Span, &'lex str)>, ParseError> {
        match self.peek()? {
            (sp, Token::DocComment(doc)) if doc.starts_with(MODULE_DOC_COMMENT) => {
                self.cur_token.take();
                Ok(Some((sp, doc)))
            }
            _ => Ok(None)
        }
    }

    fn add_name(&mut self, name: &'lex str) -> Name {
        let mut names = self.ctx.scope().borrow_names_mut();
        *self.name_cache.entry(name).or_insert_with(
            || get_standard_name_for(name).unwrap_or_else(|| names.add(name)))
    }

    fn name_value(&mut self, name: &'lex str) -> Value {
        match name {
            "true" => Value::Bool(true),
            "false" => Value::Bool(false),
            _ => {
                let name = self.add_name(name);
                Value::Name(name)
            }
        }
    }

    fn next(&mut self) -> Result<(Span, Token<'lex>), ParseError> {
        let r = self.peek()?;
        self.cur_token = None;
        Ok(r)
    }

    /// Returns the next token without consuming it.
    fn peek(&mut self) -> Result<(Span, Token<'lex>), ParseError> {
        if let Some(tok) = self.cur_token {
            Ok(tok)
        } else {
            let tok = self.lexer.next_token()?;
            self.cur_token = Some(tok);
            Ok(tok)
        }
    }
}

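// Worked example (illustrative, not from the original source): the two-line
// doc comment ";; foo\n;;  bar" is formatted as "foo\n bar\n"; each line loses
// its leading whitespace, its semicolons, and at most one following
// whitespace character.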
fn format_doc_comment(doc: &str) -> String {
    let mut buf = String::new();

    for line in doc.lines() {
        // Multi-line doc comments may contain "  ;; foo",
        // so we strip leading whitespace, semicolons, then one whitespace char.
        buf.push_str(trim_first(
            line.trim_start_matches(char::is_whitespace)
                .trim_start_matches(';'), char::is_whitespace));
        buf.push('\n');
    }

    buf
}

fn trim_first<F: FnOnce(char) -> bool>(s: &str, f: F) -> &str {
    let mut chars = s.chars();

    match chars.next() {
        Some(ch) if f(ch) => chars.as_str(),
        _ => s
    }
}

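// Note (illustrative): a documentable item has exactly three elements before
// the doc string is inserted, e.g. a doc comment preceding (define foo 123)
// becomes (define foo "..." 123); any other shape is a CannotDocumentItem
// error, as exercised by the tests below.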
fn insert_doc_comment(mut items: Vec<Value>, doc: Option<(Span, &str)>)
        -> Result<Value, ParseError> {
    if let Some((sp, doc)) = doc {
        if items.len() == 3 {
            items.insert(2, format_doc_comment(doc).into());
        } else {
            return Err(ParseError::new(sp, ParseErrorKind::CannotDocumentItem));
        }
    }

    Ok(items.into())
}

fn parse_byte(s: &str) -> Result<u8, ParseError> {
    let (b, _) = string::parse_byte(s, 0)?;
    Ok(b)
}

fn parse_char(s: &str) -> Result<char, ParseError> {
    let (ch, _) = string::parse_char(s, 0)?;
    Ok(ch)
}

fn parse_bytes(s: &str) -> Result<Bytes, ParseError> {
    let (b, _) = if s.starts_with("#br") {
        string::parse_raw_byte_string(&s[2..], 0)?
    } else {
        string::parse_byte_string(&s[2..], 0)?
    };
    Ok(Bytes::new(b))
}

fn parse_path(s: &str) -> Result<PathBuf, ParseError> {
    let (s, _) = if s.starts_with("#pr") {
        string::parse_raw_string(&s[2..], 0)?
    } else {
        string::parse_string(&s[2..], 0)?
    };
    Ok(PathBuf::from(s))
}

fn parse_string(s: &str) -> Result<String, ParseError> {
    let (s, _) = if s.starts_with('r') {
        string::parse_raw_string(s, 0)?
    } else {
        string::parse_string(s, 0)?
    };
    Ok(s)
}

fn parse_float(s: &str) -> Result<f64, ParseErrorKind> {
    strip_underscores(s).parse()
        .map_err(|_| ParseErrorKind::LiteralParseError)
}

fn parse_integer(ctx: &Context, s: &str, base: u32, sp: Span)
        -> Result<Integer, Error> {
    let s = match base {
        10 => s,
        _ => &s[2..]
    };

    let s = strip_underscores(s);

    check_integer(ctx, &s, base)?;

    Integer::from_str_radix(&s, base)
        .map_err(|_| From::from(ParseError::new(sp,
            ParseErrorKind::LiteralParseError)))
}

fn parse_ratio(ctx: &Context, s: &str, sp: Span) -> Result<Ratio, Error> {
    let s = strip_underscores(s);

    check_integer(ctx, &s, 10)?;

    s.parse().map_err(|_| From::from(ParseError::new(sp,
        ParseErrorKind::LiteralParseError)))
}

fn check_integer(ctx: &Context, mut s: &str, base: u32) -> Result<(), RestrictError> {
    let limit = ctx.restrict().max_integer_size;

    if limit == usize::max_value() {
        return Ok(());
    }

    if s.starts_with('-') {
        s = &s[1..].trim_start_matches('0');
    } else {
        s = s.trim_start_matches('0');
    }

    // Approximate the number of bits required to represent an integer
    // with this many digits in the given base.
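    // For example (illustrative), a 3-digit decimal literal needs about
    // ceil(3 * log2(10)) = 10 bits.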
    let n_bits = (s.len() as f32 * (base as f32).log2()).ceil() as usize;

    if n_bits > limit {
        Err(RestrictError::IntegerLimitExceeded)
    } else {
        Ok(())
    }
}

fn strip_underscores(s: &str) -> Cow<str> {
    if s.contains('_') {
        Owned(s.chars().filter(|&ch| ch != '_').collect())
    } else {
        Borrowed(s)
    }
}

#[cfg(test)]
mod test {
    use super::{ParseError, ParseErrorKind, Parser};
    use crate::error::Error;
    use crate::interpreter::Interpreter;
    use crate::lexer::{Span, Lexer};
    use crate::value::Value;

    fn parse(s: &str) -> Result<Value, ParseError> {
        let interp = Interpreter::new();

        let mut p = Parser::new(interp.context(), Lexer::new(s, 0));
        p.parse_single_expr().map_err(|e| {
            match e {
                Error::ParseError(e) => e,
                _ => panic!("parse returned error: {:?}", e)
            }
        })
    }

    #[test]
    fn test_doc_errors() {
        assert_eq!(parse(";; foo\n(const)").unwrap_err(), ParseError{
            span: Span{lo: 0, hi: 7}, kind: ParseErrorKind::CannotDocumentItem});
        assert_eq!(parse(";; foo\n(const foo)").unwrap_err(), ParseError{
            span: Span{lo: 0, hi: 7}, kind: ParseErrorKind::CannotDocumentItem});
        assert_eq!(parse(";; foo\n(const foo bar baz)").unwrap_err(), ParseError{
            span: Span{lo: 0, hi: 7}, kind: ParseErrorKind::CannotDocumentItem});
    }

    #[test]
    fn test_errors() {
        assert_eq!(parse("(foo").unwrap_err(), ParseError{
            span: Span{lo: 4, hi: 4}, kind: ParseErrorKind::MissingCloseParen});
        assert_eq!(parse("(foo ,bar)").unwrap_err(), ParseError{
            span: Span{lo: 5, hi: 6}, kind: ParseErrorKind::UnbalancedComma});
        assert_eq!(parse("`(foo ,,bar)").unwrap_err(), ParseError{
            span: Span{lo: 7, hi: 8}, kind: ParseErrorKind::UnbalancedComma});
    }
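
    // An illustrative addition (not part of the original test suite):
    // `parse_exprs` yields one value per top-level expression in the input.
    #[test]
    fn test_parse_exprs_sketch() {
        let interp = Interpreter::new();

        let mut p = Parser::new(interp.context(),
            Lexer::new("(foo 1) bar 2", 0));

        let exprs = p.parse_exprs().unwrap();
        assert_eq!(exprs.len(), 3);
    }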

    #[test]
    fn test_lexer_position() {
        let interp = Interpreter::new();

        let mut p = Parser::new(interp.context(),
            Lexer::new("(foo 1 2 3) bar", 0));

        p.parse_expr().unwrap();

        assert_eq!(p.lexer().current_position(), 11);
    }
}