marwood/
parse.rs

1use crate::cell::Cell;
2use crate::char::named_to_char;
3use crate::lex::TokenType::NumberPrefix;
4use crate::lex::{Token, TokenType};
5use crate::number::{Exactness, Number};
6use crate::parse::Error::{
7    ExpectedListTerminator, ExpectedVectorTerminator, Incomplete, UnexpectedToken, UnknownChar,
8};
9use crate::{lex, list};
10use std::iter::Peekable;
11
12#[derive(thiserror::Error, Debug, Eq, PartialEq)]
13pub enum Error {
14    #[error("incomplete")]
15    Incomplete,
16    #[error("unexpected token '{0}'")]
17    UnexpectedToken(String),
18    #[error("unexpected one token after .")]
19    ExpectedOneTokenAfterDot,
20    #[error("expected at least one token before .")]
21    ExpectedTokenBeforeDot,
22    #[error("expected list terminator {0}, but encountered {1}")]
23    ExpectedListTerminator(char, char),
24    #[error("expected vector terminator ), but encountered {0}")]
25    ExpectedVectorTerminator(char),
26    #[error("syntax error: {0}")]
27    SyntaxError(String),
28    #[error("unknown character {0}")]
29    UnknownChar(String),
30    #[error(transparent)]
31    LexError(#[from] lex::Error),
32}
33
34/// Parse text
35///
36/// Tokenize and parse one expression, returning the resulting Cell
37/// and any remaining text, or Error if an error occurred.
38///
39/// # Arguments
40/// *`text` - the text to parse
41pub fn parse_text(text: &str) -> Result<(Cell, Option<&str>), Error> {
42    let tokens = lex::scan(text)?;
43    let mut cur = tokens.iter().peekable();
44    let cell = parse(text, &mut cur)?;
45
46    let remaining_text = cur.peek().map(|Token { span, .. }| &text[span.0..]);
47
48    Ok((cell, remaining_text))
49}
50
51/// Parse one expression from the token stream.
52///
53/// # Arguments
54/// *`cur` - an iterator over the token stream. The parser will only
55///          advance the iterator enough to satisfy one expression.
56/// *`text` - the text backed by the token spans.
57pub fn parse<'a, T: Iterator<Item = &'a Token>>(
58    text: &str,
59    cur: &mut Peekable<T>,
60) -> Result<Cell, Error> {
61    let token = match cur.next() {
62        Some(token) => token,
63        None => return Err(Error::Incomplete),
64    };
65    match token.token_type {
66        TokenType::SingleQuote => Ok(list!["quote", parse(text, cur)?]),
67        TokenType::Quasiquote => Ok(list!["quasiquote", parse(text, cur)?]),
68        TokenType::Unquote => Ok(list!["unquote", parse(text, cur)?]),
69        TokenType::RightParen => Err(Error::UnexpectedToken(")".into())),
70        TokenType::LeftParen => parse_list(text, cur, token),
71        TokenType::HashParen => parse_vector(text, cur),
72        TokenType::True => Ok(Cell::Bool(true)),
73        TokenType::False => Ok(Cell::Bool(false)),
74        TokenType::Char => parse_char(text, token),
75        TokenType::String => parse_string(match token.span(text) {
76            "\"\"" => "",
77            span => &span[1..span.len() - 1],
78        }),
79        TokenType::Symbol => Ok(Cell::new_symbol(token.span(text))),
80        TokenType::NumberPrefix | TokenType::Number => parse_number(text, cur, token),
81        TokenType::Dot | TokenType::WhiteSpace => {
82            Err(Error::UnexpectedToken(token.span(text).into()))
83        }
84    }
85}
86
87/// Parse List
88///
89/// This function is called by a parser that's encountered a '('.
90/// It will recursively parse every value in the list until
91/// it encounters a ')' or a '.', the later of which calls
92/// parse_improper_list_tail() to finish parsing the dotted
93/// form of a list.
94///
95/// # Arguments
96/// *`cur` - an iterator over the token stream. The parser will only
97///          advance the iterator enough to satisfy one expression.
98/// *`text` - the text backed by the token spans.
99/// * `start_token` - The start of list token, used to match the end of
100///     list token.
101fn parse_list<'a, T: Iterator<Item = &'a Token>>(
102    text: &str,
103    cur: &mut Peekable<T>,
104    start_token: &Token,
105) -> Result<Cell, Error> {
106    let mut list = vec![];
107    loop {
108        match cur.peek().ok_or(Error::Incomplete)?.token_type {
109            TokenType::RightParen => {
110                let start_token = start_token.span(text).chars().next().unwrap();
111                let end_token = cur.next().unwrap().span(text).chars().next().unwrap();
112                if !match start_token {
113                    '(' => end_token == ')',
114                    '[' => end_token == ']',
115                    '{' => end_token == '}',
116                    _ => false,
117                } {
118                    return Err(ExpectedListTerminator(start_token, end_token));
119                }
120                return Ok(Cell::new_list(list));
121            }
122            TokenType::Dot => {
123                cur.next();
124                return parse_improper_list_tail(list, text, cur);
125            }
126            _ => {
127                list.push(parse(text, cur)?);
128            }
129        }
130    }
131}
132
133/// Parse Improper List Tail
134///
135/// This function is called by a parser that has encounted a '.'
136/// when parsing a list.
137///
138/// # Arguments
139/// `list` - the constructed list up to the .
140/// `text` - the text backed by the token spans.
141/// `cur` - a cursor pointing to the position in the token stream
142///         immediately after the encountered '.'
143fn parse_improper_list_tail<'a, T: Iterator<Item = &'a Token>>(
144    list: Vec<Cell>,
145    text: &str,
146    cur: &mut Peekable<T>,
147) -> Result<Cell, Error> {
148    // At least one value must be read before the dot
149    if list.is_empty() {
150        return Err(Error::ExpectedTokenBeforeDot);
151    }
152
153    // Exactly one value must be parsed after the dot
154    let last_cdr = match cur.peek().ok_or(Error::Incomplete)?.token_type {
155        TokenType::Dot | TokenType::RightParen => Err(Error::ExpectedOneTokenAfterDot),
156        _ => Ok(parse(text, cur)?),
157    }?;
158
159    // The next token must be a ')'
160    match cur.next().ok_or(Error::Incomplete)?.token_type {
161        TokenType::RightParen => Ok(Cell::new_improper_list(list, last_cdr)),
162        _ => Err(Error::ExpectedOneTokenAfterDot),
163    }
164}
165
166/// Vector
167///
168/// This function is called by a parser that's encountered a '('.
169/// It will recursively parse every value in the list until
170/// it encounters a ')' or a '.', the later of which calls
171/// parse_improper_list_tail() to finish parsing the dotted
172/// form of a list.
173///
174/// # Arguments
175/// *`cur` - an iterator over the token stream. The parser will only
176///          advance the iterator enough to satisfy one expression.
177/// *`text` - the text backed by the token spans.
178/// * `start_token` - The start of list token, used to match the end of
179///     list token.
180fn parse_vector<'a, T: Iterator<Item = &'a Token>>(
181    text: &str,
182    cur: &mut Peekable<T>,
183) -> Result<Cell, Error> {
184    let mut vector = vec![];
185    loop {
186        match cur.peek().ok_or(Error::Incomplete)?.token_type {
187            TokenType::RightParen => {
188                let end_token = cur.next().unwrap().span(text).chars().next().unwrap();
189                if end_token != ')' {
190                    return Err(ExpectedVectorTerminator(end_token));
191                }
192                return Ok(Cell::Vector(vector));
193            }
194            TokenType::Dot => {
195                return Err(UnexpectedToken(".".into()));
196            }
197            _ => {
198                vector.push(parse(text, cur)?);
199            }
200        }
201    }
202}
203
204/// Parse Char
205///
206/// Parse the character token, skipping the character prefix
207/// #\. If only one character is present after the prefix then
208/// this is a literal character. If not, then treat this as a
209/// "named" character of which only 'space' and 'newline' are
210/// recognized.
211fn parse_char(text: &str, token: &Token) -> Result<Cell, Error> {
212    let span = token.span(text);
213    let span = &span[2..span.len()];
214    if span.chars().count() == 1 {
215        Ok(Cell::Char(span.chars().next().unwrap()))
216    } else if span.starts_with('x') && span[1..span.len()].chars().all(|it| it.is_ascii_hexdigit())
217    {
218        let c =
219            u32::from_str_radix(&span[1..span.len()], 16).map_err(|_| UnknownChar(span.into()))?;
220        let c = char::from_u32(c).ok_or_else(|| UnknownChar(span.into()))?;
221        Ok(Cell::Char(c))
222    } else {
223        named_to_char(span)
224            .map(Cell::Char)
225            .ok_or_else(|| Error::UnknownChar(span.into()))
226    }
227}
228
229/// Parse String
230///
231/// Parse the literal string, processing any espace (e.g. \").
232pub fn parse_string(span: &str) -> Result<Cell, Error> {
233    let mut cur = span.chars().peekable();
234    let mut output = String::new();
235    while let Some(c) = cur.next() {
236        if c == '\\' {
237            output.push(match cur.peek() {
238                Some('\\') => '\\',
239                Some('a') => char::from_u32(0x7).unwrap(),
240                Some('b') => char::from_u32(0x8).unwrap(),
241                Some('e') => char::from_u32(0x1b).unwrap(),
242                Some('t') => '\t',
243                Some('n') => '\n',
244                Some('r') => '\r',
245                Some('v') => char::from_u32(0xb).unwrap(),
246                Some('f') => char::from_u32(0xc).unwrap(),
247                Some('x') => {
248                    cur.next();
249                    let mut unicode_value = 0_u32;
250                    loop {
251                        match cur.peek() {
252                            Some(';') => break,
253                            Some(c) if c.is_ascii_hexdigit() => {
254                                unicode_value = unicode_value.checked_mul(16).ok_or_else(|| {
255                                    Error::SyntaxError("overflow when parsing \\x".into())
256                                })?;
257                                unicode_value = unicode_value
258                                    .checked_add(c.to_digit(16).unwrap())
259                                    .ok_or_else(|| {
260                                        Error::SyntaxError("overflow when parsing \\x".into())
261                                    })?;
262                            }
263                            Some(c) => {
264                                return Err(Error::SyntaxError(format!(
265                                    "\\x expected hex digits but encountered '{}' in \"{}\"",
266                                    c, span
267                                )))
268                            }
269                            None => {
270                                return Err(Error::SyntaxError(format!(
271                                    "\\x must be terminated with ; in \"{}\"",
272                                    span
273                                )))
274                            }
275                        }
276                        cur.next();
277                    }
278                    char::from_u32(unicode_value).ok_or_else(|| {
279                        Error::SyntaxError(format!("\\#x{:x}; is not valid unicode", unicode_value))
280                    })?
281                }
282                Some(c) => *c,
283                None => return Err(Error::Incomplete),
284            });
285            cur.next();
286        } else {
287            output.push(c);
288        }
289    }
290
291    Ok(Cell::String(output))
292}
293
294/// Parse Number
295///
296/// This function is called by a parser that has encountered a number
297/// token.
298///
299/// If this function is unable to parse the number it is treated as
300/// as symbol.
301///
302/// # Arguments
303/// *`cur` - an iterator over the token stream. The parser will only
304///          advance the iterator enough to satisfy one expression.
305/// *`text` - the text backed by the token spans.
306fn parse_number<'a, T: Iterator<Item = &'a Token>>(
307    text: &str,
308    cur: &mut Peekable<T>,
309    mut token: &'a Token,
310) -> Result<Cell, Error> {
311    let mut exactness = Exactness::Unspecified;
312    let mut radix = 10;
313
314    while token.token_type == NumberPrefix {
315        match token.span(text) {
316            "#e" => exactness = Exactness::Exact,
317            "#i" => exactness = Exactness::Inexact,
318            "#d" => radix = 10,
319            "#b" => radix = 2,
320            "#o" => radix = 8,
321            "#x" => radix = 16,
322            _ => panic!("unexpected number prefix {}", token.span(text)),
323        }
324        token = cur.next().ok_or(Incomplete)?;
325    }
326
327    let span = token.span(text);
328    match Number::parse_with_exactness(span, exactness, radix) {
329        Some(num) => Ok(Cell::Number(num)),
330        None => Ok(Cell::Symbol(span.to_string())),
331    }
332}
333
334/// Parse Macro
335///
336/// Given a single expression, tokenize and parse the
337/// expression into a Cell.
338///
339/// This macro assumes the input is a single valid expression and
340/// will panic!() if it encounters lex or parse errors.
341///
342/// # Arguments
343/// `lhs` - The expression to parse
344#[macro_export]
345macro_rules! parse {
346    ($lhs:expr) => {{
347        let tokens = lex::scan($lhs).expect("lex failed");
348        let mut cur = tokens.iter().peekable();
349        parse::parse($lhs, &mut cur).expect("parse failed")
350    }};
351}
352
353#[cfg(test)]
354mod tests {
355    use super::*;
356    use crate::cell;
357    use crate::cons;
358    use crate::lex;
359    use crate::list;
360    use crate::vector;
361
362    macro_rules! parses {
363        ($($lhs:expr => $rhs:expr),+) => {{
364             $(
365                assert_eq!(parse($lhs, &mut lex::scan($lhs).unwrap().iter().peekable()), Ok($rhs));
366             )+
367        }};
368    }
369
370    macro_rules! fails {
371        ($($lhs:expr),+) => {{
372             $(
373                assert!(matches!(parse($lhs, &mut lex::scan($lhs).unwrap().iter().peekable()), Err(_)));
374             )+
375        }};
376    }
377
378    #[test]
379    fn paren_mismatch() {
380        fails!["(", ")"];
381    }
382
383    #[test]
384    fn quote_sugar() {
385        parses! {
386            "'1" => list!["quote", 1],
387            "'(1 2)" => list!["quote", list![1, 2]]
388        };
389    }
390
391    #[test]
392    fn variables() {
393        parses! {
394            "foo" => cell!["foo"],
395            "bar" => cell!["bar"]
396        };
397    }
398
399    #[test]
400    fn consumes_one_expression_per_call() {
401        let text = "foo bar baz";
402        let tokens = lex::scan(text).unwrap();
403        let mut cur = (&tokens).iter().peekable();
404        assert_eq!(parse(text, &mut cur), Ok(cell!["foo"]));
405        assert_eq!(parse(text, &mut cur), Ok(cell!["bar"]));
406        assert_eq!(parse(text, &mut cur), Ok(cell!["baz"]));
407        assert_eq!(parse(text, &mut cur), Err(Error::Incomplete));
408    }
409
410    #[test]
411    fn lists_are_fully_consumed() {
412        let text = "(foo bar)";
413        let tokens = lex::scan(text).unwrap();
414        let mut cur = (&tokens).iter().peekable();
415        assert_eq!(parse(text, &mut cur), Ok(list!["foo", "bar"]));
416        assert_eq!(parse(text, &mut cur), Err(Error::Incomplete));
417    }
418
419    #[test]
420    fn procedures() {
421        parses! {
422            "(foo)" => list!["foo"],
423            "( foo )" => list!["foo"],
424            "(foo bar baz)" => list!["foo", "bar", "baz"],
425            "()" => cell![],
426            "( )" => cell![]
427        };
428    }
429
430    #[test]
431    fn quasiquote() {
432        parses! {
433            "`(1 2 3)" => list!["quasiquote", list![1,2,3]],
434            ",(1 2 3)" => list!["unquote", list![1,2,3]]
435        };
436    }
437
438    #[test]
439    fn dotted_form() {
440        parses! {
441            "(0 . 2)" => cons![0, 2],
442            "(0 1 . 2)" => cons![0, cons![1, 2]],
443            "(1 2 . (3 4))" => list![1, 2, 3, 4],
444            "((1 . 2) . 3)" => cons![cons![1, 2], 3],
445            "((().()).())" => cons![cons![cell![], cell![]], cell![]]
446        };
447        fails!["(.)", "(. 0)", "(0 .)", "(0 .)", "(0 1 .)", "(1 . 2 . 3)"];
448    }
449
450    #[test]
451    fn vectors() {
452        parses! {
453            "#()" => vector![],
454            "#(1)" => vector![1],
455            "#(1 2 3)" => vector![1,2,3],
456            "#(#(1 2 3) #(4 5 6))" => vector![vector![1,2,3], vector![4,5,6]],
457            "#('foo 'bar)" => vector![list!["quote", "foo"], list!["quote", "bar"]]
458        };
459
460        fails!["#(1 2 3}", "#(1 2 3]", "#(1 2 . 3)"];
461    }
462
463    #[test]
464    fn numbers() {
465        parses! {
466            "42" => cell![42],
467            "+42" => cell![42],
468            "-42" => cell![-42],
469            "42..1" => cell!["42..1"]
470        };
471        parses! {
472            "#xff" => cell![255],
473            "#b11111111" => cell![255],
474            "#o377" => cell![255]
475        };
476        parses! {
477            "4/2" => cell![2],
478            "1.0" => cell![1]
479        }
480
481        parses! {
482            "#x7fffffff/1" => cell![0x7fffffff],
483            "#xffffffff/1" => cell![0xffffffff]
484        }
485    }
486
487    #[test]
488    fn characters() {
489        parses! {
490            "#\\c" => cell!['c'],
491            "#\\space" => cell![' '],
492            "#\\newline" => cell!['\n'],
493            "#\\x03bb" => cell!['λ'],
494            "#\\x03BB" => cell!['λ'],
495            "#\\x3bb" => cell!['λ']
496        }
497    }
498
499    #[test]
500    fn strings() {
501        parses! {
502            r#""""# => Cell::new_string(""),
503            r#""foo""# => Cell::new_string("foo"),
504            r#""foo \"bar\" baz""# => Cell::new_string("foo \"bar\" baz"),
505            r#""foo \\ baz""# => Cell::new_string("foo \\ baz"),
506            r#""\t""# => Cell::new_string("\t"),
507            r#""\n""# => Cell::new_string("\n"),
508            r#""\a""# => Cell::new_string(&String::from(char::from_u32(0x7).unwrap())),
509            r#""\a""# => Cell::new_string(&String::from(char::from_u32(0x7).unwrap())),
510            r#""\b""# => Cell::new_string(&String::from(char::from_u32(0x8).unwrap())),
511            r#""\e""# => Cell::new_string(&String::from(char::from_u32(0x1b).unwrap())),
512            r#""\x41;""# => Cell::new_string("A"),
513            r#""\x1f436;""# => Cell::new_string("🐶")
514        };
515
516        fails![r#""\xffffffff;""#, r#""\x41""#, r#""\xzz;""#];
517    }
518
519    #[test]
520    fn expressions() {
521        parses! {
522            "foo" => cell!["foo"],
523            "42" => cell![42],
524            "-18" => cell![-18],
525            "(foo)" => list!["foo"],
526            "(foo (bar baz))" => list!["foo", list!["bar", "baz"]]
527        };
528    }
529
530    #[test]
531    fn alt_paren_chars() {
532        parses! {
533            "[1 2 3]" => list![1, 2, 3],
534            "{1 2 3}" => list![1, 2, 3]
535        };
536
537        fails!["'(1 2 3]", "'(4 5 6}", "'{1 2 3]", "'[1 2 3}"];
538    }
539}