regulus 0.0.14

A simple, interpreted language with very simple syntax and zero dependencies
Documentation
use crate::parsing::positions::CharPositions;
use crate::parsing::syntax_error;
use crate::prelude::*;
use std::num::IntErrorKind;
use std::path::PathBuf;
use std::rc::Rc;
use std::result;
use std::str::FromStr;

/// A token of source code with location information.
///
/// Tokens are produced by `tokenize`.
#[derive(Debug)]
pub(crate) struct Token {
    /// The actual token: its kind and, where applicable, its payload.
    pub data: TokenData,
    /// The start and end of the character range this token was created from.
    /// Both ends are inclusive: a single-character token starts and ends at the same position.
    pub span: Span,
}

impl Token {
    /// Converts this token into an atom argument that carries a copy of the token's span.
    ///
    /// Returns `None` if the token is anything other than a `TokenData::Atom`.
    pub fn to_atom(&self) -> Option<Argument> {
        let TokenData::Atom(atom) = &self.data else {
            return None;
        };
        Some(Argument::Atom(atom.clone(), self.span.clone()))
    }

    /// Returns a copy of the name stored in this token.
    ///
    /// # Errors
    /// Returns a syntax error located at this token's span if the token is not a
    /// `TokenData::Name`.
    pub fn to_name(&self) -> Result<String> {
        let TokenData::Name(name) = &self.data else {
            return syntax_error("expected atom or ident", &self.span);
        };
        Ok(name.clone())
    }

    /// Whether this token is a comma.
    pub const fn is_comma(&self) -> bool {
        matches!(self.data, TokenData::Comma)
    }

    /// Whether this token is a comment.
    pub const fn is_comment(&self) -> bool {
        matches!(self.data, TokenData::Comment(_))
    }
}

/// The kind of a token together with its payload, if it has one.
#[derive(Debug)]
pub enum TokenData {
    /// An opening parenthesis `(`.
    LeftParen,
    /// A comma `,`.
    Comma,
    /// A closing parenthesis `)`.
    RightParen,
    /// A literal value: a string, a char, an integer, `true`, `false` or `null`.
    Atom(Atom),
    /// A word that is not a literal, or any word that is directly followed by `(`.
    Name(String),
    /// The text of a `#` comment, without the leading `#` and without the terminating newline.
    Comment(String),
}

/// Consumes characters from `chars` up to and including the first occurrence of `target`.
///
/// On success, returns the position of `target` together with every character that came before
/// it (`target` itself is not part of the string). Anything after `target` is left in the
/// iterator.
///
/// If the iterator runs dry before `target` shows up, returns `Err` with all characters that
/// were consumed.
fn take_until(
    mut chars: impl Iterator<Item = (Position, char)>,
    target: char,
) -> result::Result<(Position, String), String> {
    let mut collected = String::new();
    loop {
        match chars.next() {
            Some((pos, c)) if c == target => break Ok((pos, collected)),
            Some((_, c)) => collected.push(c),
            None => break Err(collected),
        }
    }
}

/// Initial capacity, in bytes, of the buffer in which `tokenize` accumulates the current word.
const CAP: usize = 10;

/// Splits `code` into a flat list of tokens, each tagged with the span it was read from.
///
/// The tokenizer recognizes:
/// - `(`, `)` and `,` as single-character tokens,
/// - `"…"` string literals and `'…'` char literals; there is no escape handling, a literal
///   simply ends at the next matching quote,
/// - `#` comments, which run up to the next newline or the end of the input,
/// - words: any other run of characters. A word ends at a space, tab, newline or at any of the
///   characters listed above. A word directly followed by `(` always becomes a
///   `TokenData::Name`; every other word is classified by `try_parse_atom`.
///
/// Whitespace only separates tokens and produces none itself. Tokens are always emitted in
/// source order: a word that is directly followed by a quote or a `#` (as in `foo"bar"` or
/// `x#note`) is emitted before the literal or comment that terminates it.
///
/// `file_path` is attached to the span of every token and of every error.
///
/// # Errors
/// Returns a `Syntax` exception for an unclosed string or char literal, for a char literal that
/// is not exactly one character long, and for an integer literal that does not fit into an
/// `i64`.
pub fn tokenize(code: &str, file_path: Rc<PathBuf>) -> Result<Vec<Token>> {
    let mut tokens = vec![];

    // The characters of the word that is currently being read.
    let mut current = String::with_capacity(CAP);

    let mut chars = CharPositions::new(code);
    let mut add_token = |data, start, end| {
        tokens.push(Token {
            span: Span::new(start, end, file_path.clone()),
            data,
        });
    };

    // Position of the first character of `current`; `Some` exactly while `current` is non-empty.
    let mut current_start_pos = None;

    // `while let` instead of `for`: the literal and comment arms below pull additional
    // characters out of `chars` themselves.
    while let Some((char_pos, c)) = chars.next() {
        let is_delimiter = matches!(c, '(' | ')' | ',' | ' ' | '\n' | '\t' | '"' | '\'' | '#');
        if !is_delimiter {
            if current_start_pos.is_none() {
                current_start_pos = Some(char_pos);
            }
            current.push(c);
            continue;
        }

        // Every delimiter ends the pending word. The word has to be emitted before the token
        // that the delimiter itself produces, otherwise tokens would end up out of source order.
        if !current.is_empty() {
            let word = std::mem::replace(&mut current, String::with_capacity(CAP));
            let data = if c == '(' {
                // Whatever is being called is a name, even if it looks like a literal.
                TokenData::Name(word)
            } else {
                try_parse_atom(word, char_pos, &file_path)?
            };
            add_token(
                data,
                current_start_pos.take().unwrap(),
                char_pos.one_back(),
            );
        }

        let syntax_error = |msg| {
            Err(Exception::spanned(
                "Syntax",
                msg,
                &Span::single(char_pos, file_path.clone()),
            ))
        };
        match c {
            '(' => add_token(TokenData::LeftParen, char_pos, char_pos),
            ')' => add_token(TokenData::RightParen, char_pos, char_pos),
            ',' => add_token(TokenData::Comma, char_pos, char_pos),
            '"' => {
                let Ok((end_pos, body)) = take_until(chars.by_ref(), '"') else {
                    return syntax_error("unclosed string literal");
                };
                add_token(TokenData::Atom(Atom::new_string(&body)), char_pos, end_pos);
            }
            '\'' => {
                let Ok((end_pos, body)) = take_until(chars.by_ref(), '\'') else {
                    return syntax_error("unclosed char literal");
                };
                match char::from_str(&body) {
                    Ok(c) => add_token(TokenData::Atom(Atom::Char(c)), char_pos, end_pos),
                    Err(e) => return syntax_error(&format!("invalid char literal: {e}")),
                }
            }
            '#' => {
                // A comment on the last line does not need a trailing newline.
                let (end_pos, body) =
                    take_until(chars.by_ref(), '\n').unwrap_or_else(|body| (last_pos(code), body));
                add_token(TokenData::Comment(body), char_pos, end_pos);
            }
            // Whitespace only separates tokens.
            _ => {}
        }
    }

    // A word that runs up to the very end of the input has not been terminated by a delimiter.
    if !current.is_empty() {
        let p = last_pos(code);
        add_token(
            try_parse_atom(current, p, &file_path)?,
            current_start_pos.unwrap(),
            p,
        );
    }

    Ok(tokens)
}

/// Classifies a complete word as either a literal or a name.
///
/// `true`, `false` and `null` become the corresponding atoms, and anything that parses as an
/// `i64` becomes an integer atom. Every other word is returned unchanged as a
/// `TokenData::Name`.
///
/// # Errors
/// Returns a `Syntax` exception located at `pos` (in `file_path`) if the word is an integer
/// literal whose value does not fit into an `i64`.
fn try_parse_atom(s: String, pos: Position, file_path: &Rc<PathBuf>) -> Result<TokenData> {
    let atom = match s.as_str() {
        "true" => Atom::Bool(true),
        "false" => Atom::Bool(false),
        "null" => Atom::Null,
        _ => match s.parse::<i64>() {
            Ok(value) => Atom::Int(value),
            Err(err)
                if matches!(
                    err.kind(),
                    IntErrorKind::PosOverflow | IntErrorKind::NegOverflow
                ) =>
            {
                return Err(Exception::spanned(
                    "Syntax",
                    format!("overflowing integer literal: {s}"),
                    &Span::single(pos, file_path.clone()),
                ));
            }
            // Not a number at all, so it has to be a name.
            Err(_) => return Ok(TokenData::Name(s)),
        },
    };
    Ok(TokenData::Atom(atom))
}

/// Returns the position of the final character of `code`.
///
/// # Panics
/// Panics if `code` contains no characters.
fn last_pos(code: &str) -> Position {
    let (pos, _) = CharPositions::new(code)
        .last()
        .expect("already found some code");
    pos
}

/// Collects the characters of `text` from `span.start` up to and including `span.end`.
/// Yields `None` for an invalid span, i.e. when the end is reached before the start
/// or when either position does not occur in `text`.
// TODO: use or remove
#[cfg_attr(not(test), expect(dead_code))]
pub fn extract(text: &str, span: &Span) -> Option<String> {
    // `None` until the start of the span has been seen, the collected characters afterwards.
    let mut collected: Option<String> = None;

    for (pos, c) in CharPositions::new(text) {
        if collected.is_none() && pos == span.start {
            collected = Some(String::new());
        }
        if let Some(buf) = &mut collected {
            buf.push(c);
        }

        if pos == span.end {
            // Still `None` here if the end came before the start.
            return collected;
        }
    }
    None
}

// Unit tests for `extract` and for the spans produced by `tokenize`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::no_path;

    /// Builds a span from line/column pairs.
    /// Line numbers are given 1-based and converted to 0-based; columns are passed on unchanged.
    fn sp(start_line: u32, start_col: u32, end_line: u32, end_col: u32) -> Span {
        Span::new(
            Position::new(start_line - 1, start_col),
            Position::new(end_line - 1, end_col),
            no_path(),
        )
    }

    /// Shorthand for `Some(text.to_string())`, to keep the assertions below short.
    #[expect(clippy::unnecessary_wraps)]
    fn so(text: &str) -> Option<String> {
        Some(text.to_string())
    }

    /// Checks `extract` on valid spans (within a line, across lines, single characters,
    /// newlines) and on invalid ones (end before start, positions outside of the text).
    #[test]
    fn extract_1() {
        let t = "abc\nde\nf\nghi\n";
        // Column 4 of the first line is its trailing newline.
        assert_eq!(extract(t, &sp(1, 1, 1, 4)), so("abc\n"));
        assert_eq!(extract(t, &sp(1, 1, 2, 2)), so("abc\nde"));
        assert_eq!(extract(t, &sp(1, 1, 2, 1)), so("abc\nd"));
        assert_eq!(extract(t, &sp(1, 3, 2, 2)), so("c\nde"));
        assert_eq!(extract(t, &sp(1, 1, 1, 2)), so("ab"));
        assert_eq!(extract(t, &sp(1, 1, 1, 1)), so("a"));
        assert_eq!(extract(t, &sp(1, 1, 1, 1000)), None);
        assert_eq!(extract(t, &sp(1, 2, 1, 1)), None);
        assert_eq!(extract(t, &sp(2, 2, 3, 2)), so("e\nf\n"));
        assert_eq!(extract(t, &sp(3, 1, 1, 1)), None);
        assert_eq!(extract(t, &sp(2, 1, 1, 4)), None);
        assert_eq!(extract(t, &sp(2, 2, 2, 2)), so("e"));
        assert_eq!(extract(t, &sp(3, 2, 3, 2)), so("\n"));
        assert_eq!(extract(t, &sp(4, 5, 4, 5)), None);
        assert_eq!(extract(t, &sp(4, 5, 4, 4)), None);
        assert_eq!(extract(t, &sp(3, 2, 4, 1)), so("\ng"));
        assert_eq!(extract(t, &sp(3, 2, 6, 1)), None);
        assert_eq!(extract(t, &sp(3, 3, 4, 1)), None);
    }

    /// Tokenizes a small program and checks that the span of every token
    /// extracts back to exactly the source text of that token.
    #[test]
    fn token_extraction() {
        let code = "_(
	def(double_and_print, x, print(*(2, x))),
)
";
        let tokens = tokenize(code, no_path()).unwrap();

        // Map every token back to the piece of source text that its span covers.
        let parts = tokens
            .into_iter()
            .map(|t| extract(code, &t.span).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(
            parts,
            [
                "_",
                "(",
                "def",
                "(",
                "double_and_print",
                ",",
                "x",
                ",",
                "print",
                "(",
                "*",
                "(",
                "2",
                ",",
                "x",
                ")",
                ")",
                ")",
                ",",
                ")"
            ]
            .map(ToString::to_string)
        );

        // Apart from whitespace, the tokens cover the whole input.
        assert_eq!(parts.join(""), code.replace(['\n', '\t', ' '], ""));
    }
}