// fnotation 0.11.2
//
// A simple lower-house syntax for programming language experimentation
// Documentation
use crate::{token::*, ParseConfig};
use std::{iter::Peekable, str::Chars};
use tattle::{declare_error, Loc, Reporter};

/// Characters that may appear in an operator token: `run` starts an operator
/// on any of them and `op` consumes a maximal run of them.
// NOTE(review): the final entry `''` is an empty character literal, which Rust
// rejects; a character was presumably lost when this text was extracted.
// Restore it from the upstream source before compiling.
const OPERATOR_CHARS: &'static [char] =
    &['+', '*', '-', '/', '<', '>', '&', '|', '!', '=', ':', ''];

// Declares `LEX_ERROR`, the error code that `Lexer::error` attaches to every
// diagnostic it hands to the reporter.
declare_error!(LEX_ERROR, "lex", "an error during the lexing phase");

/// Mutable state of a single lexing pass over `src`.
struct Lexer<'a> {
    /// Character stream over `src`, peekable for one character of lookahead.
    iter: Peekable<Chars<'a>>,
    /// The full source text; token text is sliced out of it by byte offset.
    src: &'a str,
    /// Sink for the diagnostics raised while lexing.
    reporter: Reporter,
    /// Supplies the keyword and top-level-declaration lookups.
    parse_config: &'a ParseConfig<'a>,
    /// Tokens emitted so far, in source order.
    out: Vec<Token>,
    /// True when whitespace has been skipped since the last emitted token.
    preceding_whitespace: bool,
    /// Byte offset in `src` where the token currently being lexed starts.
    prev: usize,
    /// Byte offset in `src` just past the last character consumed.
    cur: usize,
}

/// Formats a message with `format!` syntax and reports it through
/// `$m.error(String)`.
///
/// The message must be a string literal; any number of positional arguments
/// (including none) may follow it.
macro_rules! error {
    ($m:expr, $msg:literal $(, $arg:expr)*) => {{
        $m.error(format!($msg $(, $arg)*));
    }};
}

impl<'a> Lexer<'a> {
    pub fn new(source: &'a str, parse_config: &'a ParseConfig, reporter: Reporter) -> Self {
        Lexer {
            iter: source.chars().peekable(),
            src: source,
            reporter,
            parse_config,
            out: Vec::new(),
            preceding_whitespace: false,
            prev: 0,
            cur: 0,
        }
    }

    fn error(&mut self, message: String) {
        let l = Loc::new(self.prev, self.cur);
        self.emit(ERROR);
        self.reporter.error(l, LEX_ERROR, message.into());
    }

    fn peek(&mut self) -> Option<char> {
        self.iter.peek().copied()
    }

    fn advance(&mut self) -> Option<char> {
        self.iter.next().map(|c| {
            self.cur += c.len_utf8();
            c
        })
    }

    fn emit(&mut self, kind: Kind) {
        self.out.push(Token::new(
            self.preceding_whitespace,
            kind,
            Loc::new(self.prev, self.cur),
        ));
        self.preceding_whitespace = false;
        self.prev = self.cur;
    }

    fn skip(&mut self) {
        self.prev = self.cur;
    }

    fn slice(&self) -> &'a str {
        &self.src[self.prev..self.cur]
    }

    fn many<F: Fn(char) -> bool>(&mut self, f: F) {
        while let Some(c) = self.peek() {
            if f(c) {
                self.advance();
            } else {
                break;
            }
        }
    }

    fn some<F: Fn(char) -> bool>(&mut self, f: F, desc: &str) {
        if let Some(c) = self.peek() {
            if f(c) {
                self.many(f);
                return;
            }
        }
        error!(self, "expected a character satisfying {desc}");
    }
}

/// Consumes one or more word characters (alphanumeric or `_`), reporting a
/// lex error if the next character is not one.
fn word(l: &mut Lexer) {
    let is_word_char = |c: char| c == '_' || c.is_alphanumeric();
    l.some(is_word_char, "alphanumeric or '_'");
}

/// Consumes zero or more word characters (alphanumeric or `_`).
fn word0(l: &mut Lexer) {
    let is_word_char = |c: char| c == '_' || c.is_alphanumeric();
    l.many(is_word_char);
}

fn keyword(l: &mut Lexer, kind: Kind) {
    word(l);
    l.emit(kind);
}

/// Consumes a run of operator characters and emits it.
///
/// The token kind depends on whether the text is a configured keyword and on
/// `as_var`: keywords become `KEYWORD` / `KEYWORD_OP`, everything else
/// becomes `VAR` / `OP` (first of each pair when `as_var` is true).
fn op(l: &mut Lexer, as_var: bool) {
    l.many(|c| OPERATOR_CHARS.contains(&c));
    let is_keyword = l.parse_config.is_keyword(l.slice());
    let kind = match (is_keyword, as_var) {
        (true, true) => KEYWORD,
        (true, false) => KEYWORD_OP,
        (false, true) => VAR,
        (false, false) => OP,
    };
    l.emit(kind);
}

/// Lexes the remainder of a numeric literal whose first digit has already
/// been consumed.
///
/// Emits `INT` for a plain digit run. If a `.` follows, the dot and any
/// further digits are consumed and `FLOAT` is emitted instead.
fn num(l: &mut Lexer) {
    let is_decimal = |c: char| c.is_digit(10);
    l.many(is_decimal);
    if l.peek() != Some('.') {
        l.emit(INT);
        return;
    }
    // Consume the '.' and the (possibly empty) fractional digits.
    l.advance();
    l.many(is_decimal);
    l.emit(FLOAT);
}

/// Lexes the remainder of a string literal whose opening quote has already
/// been consumed.
///
/// Everything up to the next `"` is taken verbatim (no escape handling).
/// Reaching end of input first is reported as a lex error.
fn string(l: &mut Lexer) {
    l.many(|c| c != '"');
    // `many` stops either at the closing quote or at end of input.
    if l.advance().is_some() {
        l.emit(STRING);
    } else {
        error!(l, "expected closing quote for string")
    }
}

/// Errors that abort lexing outright instead of being recorded as `ERROR`
/// tokens.
#[derive(Debug)]
pub enum LexFatalError {
    /// A non-whitespace ASCII control character appeared where a token was
    /// expected.
    ASCIIControl,
}

fn run(l: &mut Lexer) -> Result<(), LexFatalError> {
    l.emit(BOF);
    while let Some(c) = l.advance() {
        match c {
            _ if c.is_whitespace() => {
                l.skip();
                l.preceding_whitespace = true;
                continue;
            }
            _ if c.is_alphabetic() || c == '_' => {
                word0(l);
                if l.parse_config.is_keyword(l.slice()) {
                    l.emit(KEYWORD);
                } else if l.parse_config.is_toplevel(l.slice()) {
                    l.emit(TOPDECL)
                } else {
                    l.emit(VAR);
                }
            }
            _ if c.is_digit(10) => num(l),
            _ if OPERATOR_CHARS.contains(&c) => op(l, false),
            '#' => {
                if let Some(c) = l.advance() {
                    if c == '(' {
                        l.emit(ANNOT);
                    } else if c == '/' {
                        while let Some(c) = l.advance() {
                            if c == '\n' {
                                l.skip();
                                break;
                            }
                        }
                    } else {
                        l.error(format!("unknown command letter '{c}'"))
                    }
                }
            }
            '`' => {
                l.skip();
                op(l, true)
            }
            '@' => keyword(l, PRIM),
            '%' => keyword(l, SPECIAL),
            '.' => keyword(l, FIELD),
            '\'' => keyword(l, TAG),
            '"' => string(l),
            ';' => l.emit(SEMICOLON),
            ',' => l.emit(COMMA),
            '{' => l.emit(LCURLY),
            '}' => l.emit(RCURLY),
            '(' => l.emit(LPAREN),
            ')' => l.emit(RPAREN),
            '[' => l.emit(LBRACK),
            ']' => l.emit(RBRACK),
            _ => {
                if c.is_ascii_control() {
                    error!(l, "ascii control character '{c}' not allowed");
                    return Err(LexFatalError::ASCIIControl);
                }
                error!(l, "unexpected character '{c}'");
            }
        }
    }
    Ok(())
}

/// Lexes `source` into a token stream.
///
/// Keywords and top-level declarations are recognised through
/// `parse_config`; recoverable problems are sent to `reporter` and show up
/// as `ERROR` tokens in the returned stream.
///
/// # Errors
///
/// Returns a [`LexFatalError`] when lexing has to be abandoned; the tokens
/// produced up to that point are discarded.
pub fn lex(
    source: &str,
    parse_config: &ParseConfig,
    reporter: Reporter,
) -> Result<Vec<Token>, LexFatalError> {
    let mut lexer = Lexer::new(source, parse_config, reporter);
    run(&mut lexer).map(|()| lexer.out)
}

#[cfg(test)]
mod test {
    use expect_test::{expect, Expect};
    use indoc::indoc;

    use crate::{test_util::AssertEqStripped, ParseConfig};

    use super::lex;

    use std::fmt::Write;
    use tattle::{display::SourceInfo, Reporter};

    /// Parse config used by the snapshots below; its third list contains
    /// only `model`.
    const DEMO_PARSECONFIG: ParseConfig = ParseConfig::new(&[], &[], &["model"]);

    /// Lexes `input`, appends the rendered token stream to the report as an
    /// `info` entry, and compares the whole report with `expected`.
    fn test(input: &str, expected: Expect) {
        let reporter = Reporter::new();
        let tokens = lex(input, &DEMO_PARSECONFIG, reporter.clone()).unwrap();

        // Render every token followed by a single space.
        let mut rendered = String::new();
        tokens
            .iter()
            .try_for_each(|tok| write!(rendered, "{} ", tok))
            .unwrap();
        reporter.info(rendered);

        let report = SourceInfo::new(None, input).extract_report_to_string(reporter);
        expected.assert_eq_stripped(&report);
    }

    #[test]
    fn lexer_tests() {
        // Single-letter identifiers lex as one VAR after BOF.
        test(
            "E",
            expect![[r#"
                info: BOF:0-0 VAR:0-1
            "#]],
        );
        test(
            "A",
            expect![[r#"
                info: BOF:0-0 VAR:0-1
            "#]],
        );
        // An unknown `#` command is reported and lexing carries on.
        test(
            indoc! {r#"
                model B
                #@error
                model A
            "#},
            expect![[r#"
                error[lex]: unknown command letter '@'
                --> <none>:2:1
                2| #@error
                2| ^^
                info: BOF:0-0 TOPDECL:0-5 VAR:6-7 ERROR:8-10 VAR:10-15 TOPDECL:16-21 VAR:22-23
            "#]],
        );
    }
}