use std::ops::Range;
// Lexical category of a single token produced by `tokenize`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    Keyword,     // word listed in `KEYWORDS` (control flow / concurrency)
    Builtin,     // word listed in `BUILTINS`, or a symbolic operator
    DefMarker,   // free-standing `:` (opens a definition)
    DefEnd,      // `;` (closes a definition)
    Integer,     // integer literal, optionally `-`-prefixed
    Float,       // float literal, optionally `-`-prefixed
    Boolean,     // the word `true` or `false`
    String,      // double-quoted literal; `text` keeps the quotes
    Comment,     // `#` up to (not including) the next newline
    TypeName,    // word listed in `TYPE_NAMES`
    StackEffect, // `(`, `)`, or the `--` separator
    Quotation,   // `[` or `]`
    Include,     // the word `include`
    ModulePath,  // word containing `:` (e.g. `std:imath`), or a glued `:`
    Identifier,  // any other word
    Whitespace,  // run of consecutive whitespace characters
    Unknown,     // single character matched by no other rule
}
// One lexeme: its category, its position in the source, and its raw text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    pub kind: TokenKind,
    // NOTE(review): `span` holds *character* indices (the scanner walks a
    // Vec<char>), not byte offsets — the two differ for non-ASCII source.
    pub span: Range<usize>,
    pub text: String, // exact source text covered by `span`
}
impl Token {
fn new(kind: TokenKind, start: usize, end: usize, text: impl Into<String>) -> Self {
Self {
kind,
span: start..end,
text: text.into(),
}
}
}
// Words highlighted as keywords (control flow plus concurrency verbs).
const KEYWORDS: &[&str] = &[
    "if", "else", "loop", "break", "match", "return", "yield", "spawn", "send", "recv", "select",
];
// Words highlighted as built-ins; grouping below is by apparent purpose
// (inferred from the names — confirm against the language's runtime).
const BUILTINS: &[&str] = &[
    // stack shuffling
    "dup",
    "drop",
    "swap",
    "over",
    "rot",
    "nip",
    "tuck",
    "pick",
    "roll",
    // integer arithmetic
    "i.add",
    "i.subtract",
    "i.multiply",
    "i.divide",
    "modulo",
    "negate",
    // comparisons
    "equals",
    "not-equals",
    "less-than",
    "greater-than",
    "less-or-equal",
    "greater-or-equal",
    // boolean logic
    "and",
    "or",
    "not",
    // quotation combinators
    "apply",
    "dip",
    "keep",
    "bi",
    "tri",
    // output
    "print",
    "println",
    "debug",
    // presumably Option/Result constructors — verify against the runtime
    "none",
    "some",
    "ok",
    "err",
];
// Capitalized words highlighted as type names wherever they appear.
const TYPE_NAMES: &[&str] = &[
    "Int", "Float", "Bool", "String", "Char", "Unit", "Option", "Result", "Channel", "Strand",
];
pub fn tokenize(source: &str) -> Vec<Token> {
let mut tokens = Vec::new();
let chars: Vec<char> = source.chars().collect();
let mut pos = 0;
while pos < chars.len() {
let start = pos;
let ch = chars[pos];
if ch.is_whitespace() {
while pos < chars.len() && chars[pos].is_whitespace() {
pos += 1;
}
let text: String = chars[start..pos].iter().collect();
tokens.push(Token::new(TokenKind::Whitespace, start, pos, text));
continue;
}
if ch == '#' {
while pos < chars.len() && chars[pos] != '\n' {
pos += 1;
}
let text: String = chars[start..pos].iter().collect();
tokens.push(Token::new(TokenKind::Comment, start, pos, text));
continue;
}
if ch == '"' {
pos += 1; while pos < chars.len() && chars[pos] != '"' {
if chars[pos] == '\\' && pos + 1 < chars.len() {
pos += 2; } else {
pos += 1;
}
}
if pos < chars.len() {
pos += 1; }
let text: String = chars[start..pos].iter().collect();
tokens.push(Token::new(TokenKind::String, start, pos, text));
continue;
}
if ch == '[' || ch == ']' {
pos += 1;
tokens.push(Token::new(TokenKind::Quotation, start, pos, ch.to_string()));
continue;
}
if ch == '(' || ch == ')' {
pos += 1;
tokens.push(Token::new(
TokenKind::StackEffect,
start,
pos,
ch.to_string(),
));
continue;
}
if ch == ':' {
pos += 1;
if start > 0 && !chars[start - 1].is_whitespace() {
let text: String = chars[start..pos].iter().collect();
tokens.push(Token::new(TokenKind::ModulePath, start, pos, text));
} else {
tokens.push(Token::new(TokenKind::DefMarker, start, pos, ":"));
}
continue;
}
if ch == ';' {
pos += 1;
tokens.push(Token::new(TokenKind::DefEnd, start, pos, ";"));
continue;
}
if ch == '-' && pos + 1 < chars.len() && chars[pos + 1] == '-' {
pos += 2;
tokens.push(Token::new(TokenKind::StackEffect, start, pos, "--"));
continue;
}
if ch.is_ascii_digit()
|| (ch == '-' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit())
{
let is_negative = ch == '-';
if is_negative {
pos += 1;
}
while pos < chars.len() && chars[pos].is_ascii_digit() {
pos += 1;
}
if pos < chars.len()
&& chars[pos] == '.'
&& pos + 1 < chars.len()
&& chars[pos + 1].is_ascii_digit()
{
pos += 1; while pos < chars.len() && chars[pos].is_ascii_digit() {
pos += 1;
}
let text: String = chars[start..pos].iter().collect();
tokens.push(Token::new(TokenKind::Float, start, pos, text));
} else {
let text: String = chars[start..pos].iter().collect();
tokens.push(Token::new(TokenKind::Integer, start, pos, text));
}
continue;
}
if ch.is_alphabetic() || ch == '_' || ch == '-' {
while pos < chars.len() {
let c = chars[pos];
if c.is_alphanumeric() || c == '_' || c == '-' || c == ':' || c == '.' {
pos += 1;
} else {
break;
}
}
let text: String = chars[start..pos].iter().collect();
let kind = classify_identifier(&text);
tokens.push(Token::new(kind, start, pos, text));
continue;
}
if matches!(ch, '+' | '*' | '/' | '%' | '=' | '<' | '>') {
pos += 1;
if pos < chars.len() {
let next = chars[pos];
if (ch == '<' && (next == '=' || next == '>')) || (ch == '>' && next == '=') {
pos += 1;
}
}
let text: String = chars[start..pos].iter().collect();
tokens.push(Token::new(TokenKind::Builtin, start, pos, text));
continue;
}
pos += 1;
tokens.push(Token::new(TokenKind::Unknown, start, pos, ch.to_string()));
}
tokens
}
/// Maps a scanned word to its token kind.
///
/// Literal forms (`true`/`false`, `include`) win first, then the word
/// tables, then the `:`-containing module-path fallback; anything left
/// is a plain identifier.
fn classify_identifier(text: &str) -> TokenKind {
    match text {
        "true" | "false" => TokenKind::Boolean,
        "include" => TokenKind::Include,
        word if KEYWORDS.contains(&word) => TokenKind::Keyword,
        word if BUILTINS.contains(&word) => TokenKind::Builtin,
        word if TYPE_NAMES.contains(&word) => TokenKind::TypeName,
        word if word.contains(':') => TokenKind::ModulePath,
        _ => TokenKind::Identifier,
    }
}
/// Like `tokenize`, but with all whitespace tokens dropped.
#[allow(dead_code)]
pub fn tokenize_visible(source: &str) -> Vec<Token> {
    let mut tokens = tokenize(source);
    tokens.retain(|t| t.kind != TokenKind::Whitespace);
    tokens
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Convenience: the kinds of all visible tokens in `source`.
    fn kinds_of(source: &str) -> Vec<TokenKind> {
        tokenize_visible(source).iter().map(|t| t.kind).collect()
    }

    #[test]
    fn test_tokenize_numbers() {
        let pairs: Vec<(TokenKind, &str)> = tokenize_visible("42 3.14 -5")
            .iter()
            .map(|t| (t.kind, t.text.as_str()))
            .collect();
        assert_eq!(
            pairs,
            [
                (TokenKind::Integer, "42"),
                (TokenKind::Float, "3.14"),
                (TokenKind::Integer, "-5"),
            ]
        );
    }

    #[test]
    fn test_tokenize_string() {
        let tokens = tokenize_visible("\"hello world\"");
        assert_eq!(tokens.len(), 1);
        let tok = &tokens[0];
        assert_eq!(
            (tok.kind, tok.text.as_str()),
            (TokenKind::String, "\"hello world\"")
        );
    }

    #[test]
    fn test_tokenize_comment() {
        assert_eq!(
            kinds_of("42 # this is a comment"),
            [TokenKind::Integer, TokenKind::Comment]
        );
    }

    #[test]
    fn test_tokenize_definition() {
        assert_eq!(
            kinds_of(": square dup i.multiply ;"),
            [
                TokenKind::DefMarker,
                TokenKind::Identifier,
                TokenKind::Builtin,
                TokenKind::Builtin,
                TokenKind::DefEnd,
            ]
        );
    }

    #[test]
    fn test_tokenize_keywords() {
        assert!(kinds_of("if else loop break")
            .iter()
            .all(|&k| k == TokenKind::Keyword));
    }

    #[test]
    fn test_tokenize_builtins() {
        assert!(kinds_of("dup drop swap over")
            .iter()
            .all(|&k| k == TokenKind::Builtin));
    }

    #[test]
    fn test_tokenize_booleans() {
        assert!(kinds_of("true false")
            .iter()
            .all(|&k| k == TokenKind::Boolean));
    }

    #[test]
    fn test_tokenize_stack_effect() {
        assert_eq!(
            kinds_of("( Int Int -- Int )"),
            [
                TokenKind::StackEffect,
                TokenKind::TypeName,
                TokenKind::TypeName,
                TokenKind::StackEffect,
                TokenKind::TypeName,
                TokenKind::StackEffect,
            ]
        );
    }

    #[test]
    fn test_tokenize_quotation() {
        assert_eq!(
            kinds_of("[ dup i.multiply ]"),
            [
                TokenKind::Quotation,
                TokenKind::Builtin,
                TokenKind::Builtin,
                TokenKind::Quotation,
            ]
        );
    }

    #[test]
    fn test_tokenize_include() {
        assert_eq!(
            kinds_of("include std:imath"),
            [TokenKind::Include, TokenKind::ModulePath]
        );
    }

    #[test]
    fn test_span_positions() {
        // Whitespace tokens are kept here, hence the middle 2..3 span.
        let spans: Vec<_> = tokenize("42 dup").iter().map(|t| t.span.clone()).collect();
        assert_eq!(spans, [0..2, 2..3, 3..6]);
    }

    #[test]
    fn test_escaped_string() {
        let tokens = tokenize_visible(r#""hello \"world\"""#);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::String);
    }
}