regulus 0.0.14

A simple, interpreted language with very simple syntax and zero dependencies
Documentation
// The grammar is:
// G = ({S, X}, {a, n, (, ), ","}, P, S)
// (a represents any atom, n any name / ident)
// with P:
//  S -> a | n | n() | n(X)
//  X -> S,X | S, | S

mod positions;
mod token;

use crate::no_path;
use crate::parsing::token::Token;
use crate::prelude::*;
pub use positions::{Position, Span};
pub(crate) use token::{TokenData, tokenize};

/// Shorthand for failing with a syntax error.
///
/// Builds `Exception::spanned("Syntax", msg, span)` and wraps it in `Err`, so
/// callers can write `return syntax_error(..)` for any `Result<T>`.
fn syntax_error<T>(msg: impl Into<String>, span: &Span) -> Result<T> {
    let exception = Exception::spanned("Syntax", msg, span);
    Err(exception)
}

/// Parses a complete token stream into a single [`Argument`].
///
/// # Errors
/// Propagates any error from parsing the first subprogram, and fails with a
/// syntax error if a non-comment token is left over after that subprogram.
pub fn build_program(tokens: Vec<Token>) -> Result<Argument> {
    let mut remaining: &[Token] = &tokens;
    let program = build_subprogram(&mut remaining)?;

    // Comments after the program are fine; anything else is leftover input.
    match next_non_comment(remaining) {
        Some(leftover) => syntax_error("trailing unparsed tokens detected", &leftover.span),
        None => Ok(program),
    }
}

/// Peeks at the first token in `tokens` that is not a comment, without
/// consuming anything. Returns `None` if there is no such token.
fn next_non_comment(tokens: &[Token]) -> Option<&Token> {
    let first_code = tokens.iter().position(|token| !token.is_comment())?;
    Some(&tokens[first_code])
}

/// Splits off the leading run of comments together with the first
/// non-comment token.
///
/// On success returns `(comments, token)`, where `comments` are all tokens in
/// front of `token`, and advances `tokens` to just past `token`.
///
/// # Errors
/// Fails with a syntax error if `tokens` holds no non-comment token; `tokens`
/// is left untouched in that case.
fn eat_commented_token<'a>(tokens: &mut &'a [Token]) -> Result<(&'a [Token], &'a Token)> {
    let all: &'a [Token] = *tokens;

    if let Some(idx) = all.iter().position(|t| !t.is_comment()) {
        *tokens = &all[idx + 1..];
        return Ok((&all[..idx], &all[idx]));
    }

    // This error can only arise in two ways:
    // * when looking for an argument list comma, in which case the caller
    //   discards the error message anyway
    // * when parsing a program with zero non-comment tokens, so an unhelpful
    //   span is fine
    let fallback_span = Span::single(Position::ONE, no_path());
    Err(Exception::spanned(
        "Syntax",
        "program contains no non-comment tokens",
        &fallback_span,
    ))
}

/// Isolates the contents of a parenthesized argument list.
///
/// Given `_(foo(), bar(baz()))`, this takes `(foo(), bar(baz()))` (with start
/// and end paren) as its argument and yields `foo(), bar(baz())` (without
/// them).
///
/// Sets `tokens` to the tokens inside the parens (excluding the start and end
/// parens) and returns the span of the end paren together with the tokens
/// that follow it.
///
/// The first non-comment token of `tokens` must be a `(`. Comments in front of
/// it are skipped, so a comment between a function name and its `(` is
/// handled. `tokens` must not be empty.
///
/// # Errors
/// Fails with a syntax error, located at the opening paren, if no matching
/// `)` is found.
fn extract_within_parens<'a>(tokens: &mut &'a [Token]) -> Result<(Span, &'a [Token])> {
    let all: &'a [Token] = *tokens;

    // The caller only checks that the first *non-comment* token is `(`, so
    // comments may precede it. Starting the depth count on a comment would
    // make a well-formed call look unclosed.
    let open = all.iter().position(|t| !t.is_comment()).unwrap_or(0);

    // Nesting depth; starts at 1 for the opening paren at `open`.
    let mut depth = 1u32;
    for (i, token) in all.iter().enumerate().skip(open + 1) {
        match token.data {
            TokenData::LeftParen => depth += 1,
            TokenData::RightParen => {
                depth -= 1;
                if depth == 0 {
                    *tokens = &all[open + 1..i];
                    return Ok((token.span.clone(), &all[i + 1..]));
                }
            }
            _ => (),
        }
    }
    syntax_error("unclosed `(` parenthesis", &all[open].span)
}

/// Joins the text of the given comment tokens into one string, one comment
/// per line, without a trailing newline. A single leading space is removed
/// from each comment if present.
///
/// # Panics
/// Panics if any token in `tokens` is not a comment.
fn concat_doc_comments(tokens: &[Token]) -> String {
    let lines: Vec<&str> = tokens
        .iter()
        .map(|token| {
            let TokenData::Comment(text) = &token.data else {
                unreachable!()
            };
            text.strip_prefix(' ').unwrap_or(text)
        })
        .collect();
    lines.join("\n")
}

/// Parses one `S` production (atom, variable or function call) from the front
/// of `tokens` and returns the constructed argument.
///
/// `tokens` is advanced past everything that was consumed. Comments in front
/// of a function call become its `doc_comment`.
fn build_subprogram(tokens: &mut &[Token]) -> Result<Argument> {
    let (leading_comments, head) = eat_commented_token(tokens)?;

    // S -> a
    if let Some(atom) = head.to_atom() {
        return Ok(atom);
    }
    let name = head.to_name()?;

    // S -> n  (no argument list follows the name)
    let open_paren_span = match next_non_comment(tokens) {
        Some(Token {
            data: TokenData::LeftParen,
            span,
        }) => span,
        _ => return Ok(Argument::Variable(name, head.span.clone())),
    };

    // S -> n() | n(X): narrow `tokens` to the argument list and remember what
    // comes after the closing paren.
    let (close_paren_span, after_call) = extract_within_parens(tokens)?;

    let mut args = Vec::new();
    while next_non_comment(tokens).is_some() {
        args.push(build_subprogram(tokens)?);

        match eat_commented_token(tokens) {
            Ok((_, separator)) if separator.is_comma() => {}
            Ok((_, separator)) => {
                return syntax_error("missing comma in argument list", &separator.span);
            }
            // nothing but comments left: the argument list is done
            Err(_) => break,
        }
    }
    *tokens = after_call;

    let call = FunctionCall {
        args,
        name,
        doc_comment: concat_doc_comments(leading_comments),
    };
    let call_span = Span {
        start: open_paren_span.start,
        ..close_paren_span
    };
    Ok(Argument::FunctionCall(call, call_span))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::no_path;

    /// Tokenizes and parses `code`.
    ///
    /// Panics if tokenizing fails: these tests target the parser, so every
    /// input is expected to get past the tokenizer.
    fn make_program(code: &str) -> Result<Argument> {
        let tokens =
            tokenize(code, no_path()).expect("test source should tokenize without errors");
        build_program(tokens)
    }

    /// A `(` where an atom or ident is expected is rejected, both as an
    /// argument and at the very start of the program.
    #[test]
    fn extra_parens() {
        let prog = make_program("_((2))");

        assert_eq!(
            prog.unwrap_err().to_string(),
            "SyntaxError: expected atom or ident\nat <file>:0:3"
        );

        let prog = make_program("(print(2)), print(3)");

        assert_eq!(
            prog.unwrap_err().to_string(),
            "SyntaxError: expected atom or ident\nat <file>:0:1"
        );
    }

    /// `2(4)` parses successfully and stringifies back to the same text.
    #[test]
    fn atom_fn() {
        let prog = make_program("2(4)");
        assert_eq!(prog.unwrap().stringify(), "2(4)");
    }

    /// Two consecutive commas are rejected at the second comma.
    #[test]
    fn two_commas() {
        let prog = make_program("_(4,,4)");
        assert_eq!(
            prog.unwrap_err().to_string(),
            "SyntaxError: expected atom or ident\nat <file>:0:5"
        );
    }

    /// An empty source produces the dedicated "no tokens" error.
    #[test]
    fn empty_program() {
        let prog = make_program("");
        assert_eq!(
            prog.unwrap_err().to_string(),
            "SyntaxError: program contains no non-comment tokens\nat <file>:1:1"
        );
    }
}