use logos::Logos;
use crate::error::CompileError;
use crate::span::Span;
/// Every kind of token the lexer can produce.
///
/// Most variants carry a `#[token]` or `#[regex]` pattern and are recognised by
/// the lexer that `logos` derives for this enum. `DocBlock`, `Comment` and
/// `InterpStr` carry no pattern: `tokenize` recognises them by hand before
/// falling back to the derived lexer.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
// Runs of spaces, tabs, carriage returns and newlines produce no token.
#[logos(skip r"[ \t\r\n]+")]
pub enum TokenKind {
// --- Reserved words (keywords, built-in type names, constructors) ---
#[token("commons")]
Commons,
#[token("type")]
Type,
#[token("fn")]
Fn,
#[token("where")]
Where,
#[token("and")]
And,
#[token("true")]
True,
#[token("false")]
False,
#[token("Int")]
Int,
#[token("String")]
String,
#[token("Bool")]
Bool,
#[token("Float")]
Float,
#[token("Duration")]
Duration,
#[token("Instant")]
Instant,
#[token("let")]
Let,
#[token("if")]
If,
#[token("else")]
Else,
#[token("Ok")]
Ok,
#[token("Err")]
Err,
#[token("Result")]
Result,
#[token("ValidationError")]
ValidationError,
#[token("JsonError")]
JsonError,
#[token("enum")]
Enum,
#[token("match")]
Match,
#[token("Option")]
Option,
#[token("record")]
Record,
// Named `Self_` because `Self` is a reserved word in Rust.
#[token("self")]
Self_,
#[token("Some")]
Some,
#[token("None")]
None,
#[token("is")]
Is,
#[token("opaque")]
Opaque,
#[token("uses")]
Uses,
#[token("context")]
Context,
#[token("consumes")]
Consumes,
#[token("exports")]
Exports,
#[token("transparent")]
Transparent,
#[token("as")]
As,
#[token("assert")]
Assert,
#[token("expect")]
Expect,
#[token("mocks")]
Mocks,
#[token("test")]
Test,
#[token("wires")]
Wires,
#[token("adapter")]
Adapter,
#[token("binding")]
Binding,
#[token("agent")]
Agent,
#[token("capability")]
Capability,
#[token("Effect")]
Effect,
#[token("given")]
Given,
#[token("on")]
On,
#[token("http")]
Http,
#[token("cron")]
Cron,
#[token("queue")]
Queue,
#[token("from")]
From,
#[token("protocol")]
Protocol,
#[token("provides")]
Provides,
#[token("service")]
Service,
#[token("actor")]
Actor,
#[token("by")]
By,
#[token("invariant")]
Invariant,
#[token("implies")]
Implies,
// --- Multi-character punctuation ---
#[token("...")]
DotDotDot,
#[token("<-")]
LArrow,
#[token("~>")]
TildeArrow,
#[token(":=")]
ColonEq,
// --- Trivia: no pattern here, `tokenize` emits these itself ---
// A `---`-fenced documentation block.
DocBlock,
// A `--` line comment; its span stops before the newline.
Comment,
// --- Identifiers and literals ---
// Starts with an ASCII letter; `_` may only appear after the first character.
#[regex(r"[A-Za-z][A-Za-z0-9_]*")]
Ident,
// Unsigned run of decimal digits; `tokenize` rejects values that do not fit an `i64`.
#[regex(r"[0-9]+")]
IntLit,
// Needs digits on both sides of the `.`, or an exponent; `tokenize` rejects non-finite values.
#[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+[eE][+-]?[0-9]+")]
FloatLit,
// Single-line string; the only escapes accepted are `\n`, `\t`, `\"` and `\\`.
#[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
StrLit,
// A string containing at least one `\(` hole; no pattern, `tokenize` scans it by hand.
InterpStr,
// --- Operators ---
#[token("->")]
Arrow,
#[token("==")]
EqEq,
#[token("!=")]
BangEq,
#[token("<=")]
LtEq,
#[token(">=")]
GtEq,
#[token("&&")]
AmpAmp,
#[token("||")]
PipePipe,
#[token("+")]
Plus,
#[token("-")]
Minus,
#[token("*")]
Star,
#[token("/")]
Slash,
#[token("!")]
Bang,
#[token("=")]
Eq,
#[token("<")]
Lt,
#[token(">")]
Gt,
#[token("?")]
Question,
#[token("=>")]
FatArrow,
#[token("_")]
Underscore,
#[token("|")]
Pipe,
#[token("@")]
At,
// --- Delimiters and separators ---
#[token("(")]
LParen,
#[token(")")]
RParen,
#[token("{")]
LBrace,
#[token("}")]
RBrace,
#[token("[")]
LBracket,
#[token("]")]
RBracket,
#[token(",")]
Comma,
#[token(":")]
Colon,
#[token(".")]
Dot,
}
impl TokenKind {
pub fn describe(self) -> &'static str {
use TokenKind::*;
match self {
Commons => "`commons`",
Type => "`type`",
Fn => "`fn`",
Where => "`where`",
And => "`and`",
True => "`true`",
False => "`false`",
Int => "`Int`",
String => "`String`",
Bool => "`Bool`",
Float => "`Float`",
Duration => "`Duration`",
Instant => "`Instant`",
Let => "`let`",
If => "`if`",
Else => "`else`",
Ok => "`Ok`",
Err => "`Err`",
Result => "`Result`",
ValidationError => "`ValidationError`",
JsonError => "`JsonError`",
Enum => "`enum`",
Match => "`match`",
Option => "`Option`",
Record => "`record`",
Self_ => "`self`",
Some => "`Some`",
None => "`None`",
Is => "`is`",
Opaque => "`opaque`",
Uses => "`uses`",
Context => "`context`",
Consumes => "`consumes`",
Exports => "`exports`",
Transparent => "`transparent`",
As => "`as`",
Assert => "`assert`",
Expect => "`expect`",
Mocks => "`mocks`",
Test => "`test`",
Wires => "`wires`",
Adapter => "`adapter`",
Binding => "`binding`",
Agent => "`agent`",
Capability => "`capability`",
Effect => "`Effect`",
Given => "`given`",
On => "`on`",
Http => "`http`",
Cron => "`cron`",
Queue => "`queue`",
From => "`from`",
Protocol => "`protocol`",
Provides => "`provides`",
Service => "`service`",
Actor => "`actor`",
By => "`by`",
Invariant => "`invariant`",
Implies => "`implies`",
ColonEq => "`:=`",
DotDotDot => "`...`",
LArrow => "`<-`",
TildeArrow => "`~>`",
DocBlock => "documentation block",
Comment => "line comment",
Ident => "identifier",
IntLit => "integer literal",
FloatLit => "float literal",
StrLit => "string literal",
InterpStr => "interpolated string",
Arrow => "`->`",
EqEq => "`==`",
BangEq => "`!=`",
LtEq => "`<=`",
GtEq => "`>=`",
AmpAmp => "`&&`",
PipePipe => "`||`",
Plus => "`+`",
Minus => "`-`",
Star => "`*`",
Slash => "`/`",
Bang => "`!`",
Eq => "`=`",
Lt => "`<`",
Gt => "`>`",
Question => "`?`",
FatArrow => "`=>`",
Underscore => "`_`",
Pipe => "`|`",
At => "`@`",
LParen => "`(`",
RParen => "`)`",
LBrace => "`{`",
RBrace => "`}`",
LBracket => "`[`",
RBracket => "`]`",
Comma => "`,`",
Colon => "`:`",
Dot => "`.`",
}
}
}
/// A single lexed token: what it is and where it sits in the source text.
#[derive(Debug, Clone, Copy)]
pub struct Token {
/// The kind of token.
pub kind: TokenKind,
/// Location of the token; `tokenize` builds it from byte offsets into the source string.
pub span: Span,
}
/// Converts `source` into a flat list of tokens, trivia included.
///
/// At each position the lexer tries, in this order: a `---` doc block, a `--`
/// line comment, a whitespace byte (skipped), an interpolated string, and
/// finally the lexer derived for [`TokenKind`]. Doc-block and comment tokens
/// are kept in the output rather than discarded.
///
/// # Errors
///
/// Lexing stops at the first problem. The error code is one of
/// `bynk.lex.unclosed_doc_block`, `bynk.lex.unterminated_string`,
/// `bynk.lex.unterminated_interpolation`, `bynk.lex.bad_escape`,
/// `bynk.lex.integer_overflow`, `bynk.lex.float_literal_overflow` or
/// `bynk.lex.unexpected_character`.
pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
    let bytes = source.as_bytes();
    let mut tokens = Vec::new();
    let mut pos = 0;

    while pos < bytes.len() {
        // Doc blocks must be tried before comments: their `---` fence would
        // otherwise be swallowed as a `--` line comment.
        if let Some(open_end) = doc_block_open_at(source, pos) {
            let Some((_, close_end)) = doc_block_close(source, open_end) else {
                return Err(CompileError::new(
                    "bynk.lex.unclosed_doc_block",
                    Span::new(pos, open_end),
                    "documentation block opened but never closed",
                )
                .with_note(
                    "a doc block must be terminated by another `---` on a line by itself",
                ));
            };
            tokens.push(Token {
                kind: TokenKind::DocBlock,
                span: Span::new(pos, close_end),
            });
            pos = close_end;
            continue;
        }

        // Line comment: runs up to, but not including, the newline.
        if bytes[pos..].starts_with(b"--") {
            let end = bytes[pos..]
                .iter()
                .position(|&b| b == b'\n')
                .map_or(bytes.len(), |offset| pos + offset);
            tokens.push(Token {
                kind: TokenKind::Comment,
                span: Span::new(pos, end),
            });
            pos = end;
            continue;
        }

        if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
            pos += 1;
            continue;
        }

        // A string with a `\(` hole is scanned by hand because holes can nest
        // parentheses and further strings.
        if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
            let end = scan_str(bytes, source, pos)?;
            tokens.push(Token {
                kind: TokenKind::InterpStr,
                span: Span::new(pos, end),
            });
            pos = end;
            continue;
        }

        // Everything else goes to the derived lexer, restarted at `pos` so the
        // spans it reports are relative to `pos`.
        let mut lexer = TokenKind::lexer(&source[pos..]);
        let Some(outcome) = lexer.next() else {
            let ch = source[pos..].chars().next().unwrap_or('\0');
            return Err(CompileError::new(
                "bynk.lex.unexpected_character",
                Span::new(pos, pos + ch.len_utf8()),
                format!("unexpected character `{ch}`"),
            ));
        };
        let local = lexer.span();
        let span: Span = Span::new(pos + local.start, pos + local.end);
        let text = &source[span.range()];

        let kind = match outcome {
            Ok(kind) => kind,
            Err(()) => {
                let ch = text.chars().next().unwrap_or('\0');
                if ch != '"' {
                    return Err(CompileError::new(
                        "bynk.lex.unexpected_character",
                        span,
                        format!("unexpected character `{ch}`"),
                    ));
                }
                // A rejected token that starts with a quote is a string the
                // `StrLit` pattern could not match.
                return Err(CompileError::new(
                    "bynk.lex.unterminated_string",
                    span,
                    "unterminated string literal",
                )
                .with_note(
                    "string literals must close with `\"` on the same line; \
                     supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
                ));
            }
        };

        // The numeric patterns accept any number of digits, so the value range
        // is checked here.
        match kind {
            TokenKind::IntLit if text.parse::<i64>().is_err() => {
                return Err(CompileError::new(
                    "bynk.lex.integer_overflow",
                    span,
                    format!(
                        "integer literal `{text}` is out of range for a 64-bit signed integer"
                    ),
                )
                .with_note("the range is -2^63 to 2^63 - 1"));
            }
            TokenKind::FloatLit
                if !matches!(text.parse::<f64>(), Ok(v) if v.is_finite()) =>
            {
                return Err(CompileError::new(
                    "bynk.lex.float_literal_overflow",
                    span,
                    format!("float literal `{text}` is out of range for a 64-bit float"),
                )
                .with_note(
                    "the literal does not fit a finite IEEE 754 double; \
                     the largest finite value is ~1.8e308",
                ));
            }
            _ => {}
        }

        tokens.push(Token { kind, span });
        pos = span.end;
    }

    Ok(tokens)
}
/// Reports whether the string opening at `start` contains a `\(` hole.
///
/// `start` is the index of the opening quote. The scan gives up (returning
/// `false`) at the first unescaped `"`, at a newline, or at the end of input.
/// Any other backslash pair is skipped as a unit, so `\\(` and `\"` do not
/// confuse it.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut cursor = start + 1;
    while let Some(&byte) = bytes.get(cursor) {
        match byte {
            b'\n' | b'"' => break,
            b'\\' if bytes.get(cursor + 1) == Some(&b'(') => return true,
            // Some other escape: step over the backslash and the escaped byte.
            b'\\' => cursor += 2,
            _ => cursor += 1,
        }
    }
    false
}
/// Scans the string literal whose opening quote sits at `start` and returns
/// the offset just past its closing quote.
///
/// `bytes` must be `source.as_bytes()`. Interpolation holes (`\(` … `)`) are
/// delegated to `scan_hole`, which may recurse back here for strings nested
/// inside a hole.
///
/// # Errors
///
/// * `bynk.lex.unterminated_string` if a newline or the end of input is
///   reached before the closing quote.
/// * `bynk.lex.bad_escape` if a backslash is followed by anything other than
///   `n`, `t`, `"`, `\` or `(`.
/// * Whatever `scan_hole` reports for a malformed hole.
fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
    debug_assert_eq!(bytes[start], b'"');
    let mut i = start + 1;
    loop {
        if i >= bytes.len() || bytes[i] == b'\n' {
            return Err(CompileError::new(
                "bynk.lex.unterminated_string",
                Span::new(start, i.min(bytes.len())),
                "unterminated string literal",
            )
            .with_note(
                "string literals must close with `\"` on the same line; \
                 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
            ));
        }
        match bytes[i] {
            b'"' => return Ok(i + 1),
            b'\\' => match bytes.get(i + 1) {
                Some(b'n' | b't' | b'"' | b'\\') => i += 2,
                Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
                _ => {
                    // Decode the escaped character from `source` instead of
                    // casting one byte: for a multi-byte character the cast
                    // shows the wrong character and `i + 2` would end the span
                    // inside a UTF-8 sequence. The backslash is ASCII, so
                    // `i + 1` is always a character boundary.
                    let escaped = source.get(i + 1..).and_then(|rest| rest.chars().next());
                    let shown = escaped.map(|ch| ch.to_string()).unwrap_or_default();
                    // A backslash at the very end of input has nothing after
                    // it; the span then covers the backslash alone.
                    let end = i + 1 + escaped.map_or(0, char::len_utf8);
                    return Err(CompileError::new(
                        "bynk.lex.bad_escape",
                        Span::new(i, end),
                        format!("invalid escape sequence `\\{shown}` in string literal"),
                    )
                    .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
                }
            },
            // Continuation bytes of multi-byte characters fall through here;
            // none of them can equal an ASCII delimiter.
            _ => i += 1,
        }
    }
}
/// Scans an interpolation hole and returns the offset just past its closing `)`.
///
/// `start` is the first byte after the opening `\(`. Parentheses inside the
/// hole are balanced, and a string literal inside the hole is skipped as a
/// whole via `scan_str`, so a `)` inside such a string does not close the hole.
///
/// # Errors
///
/// `bynk.lex.unterminated_interpolation` if a newline or the end of input is
/// reached first; errors from a nested `scan_str` are passed through.
fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
    let mut cursor = start;
    // The `\(` that opened the hole counts as the first open parenthesis.
    let mut open_parens = 1usize;

    while let Some(&byte) = bytes.get(cursor) {
        match byte {
            b'\n' => break,
            b'(' => {
                open_parens += 1;
                cursor += 1;
            }
            b')' => {
                open_parens -= 1;
                cursor += 1;
                if open_parens == 0 {
                    return Ok(cursor);
                }
            }
            b'"' => cursor = scan_str(bytes, source, cursor)?,
            _ => cursor += 1,
        }
    }

    // Ran into a newline or off the end. The reported span starts at the `\(`
    // introducer, two bytes before `start`.
    Err(CompileError::new(
        "bynk.lex.unterminated_interpolation",
        Span::new(start.saturating_sub(2), cursor.min(bytes.len())),
        "unterminated interpolation hole",
    )
    .with_note(
        "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
    ))
}
/// One piece of an interpolated string, as produced by `split_interp`.
pub(crate) enum InterpSegment {
/// Literal text, with its escape sequences already decoded.
Chunk(String),
/// Location of a hole's contents: the source text between `\(` and its matching `)`.
Hole(Span),
}
/// Splits an interpolated string token into literal chunks and holes.
///
/// `span` must be the span of an `InterpStr` token as emitted by `tokenize`,
/// i.e. it includes both quotes and its escapes have already been validated.
/// Escapes in literal text are decoded. Each hole is reported as the span of
/// the text between `\(` and its matching `)`. Empty chunks are never emitted.
///
/// # Errors
///
/// Passes through any error from `scan_hole`.
///
/// # Panics
///
/// Panics if the string contains an escape other than `\n`, `\t`, `\"`, `\\`
/// or `\(`.
pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
    let bytes = source.as_bytes();
    // Skip the opening quote and stop before the closing one.
    let body_end = span.end - 1;
    let mut cursor = span.start + 1;
    let mut segments = Vec::new();
    let mut literal = String::new();

    while cursor < body_end {
        // Ordinary character: copy it whole so multi-byte characters survive.
        if bytes[cursor] != b'\\' {
            let ch = source[cursor..].chars().next().unwrap();
            literal.push(ch);
            cursor += ch.len_utf8();
            continue;
        }

        let escape = bytes[cursor + 1];
        if escape == b'(' {
            // A hole ends the literal text gathered so far.
            if !literal.is_empty() {
                segments.push(InterpSegment::Chunk(std::mem::take(&mut literal)));
            }
            let hole_start = cursor + 2;
            let hole_end = scan_hole(bytes, source, hole_start)?;
            // `hole_end` is just past the `)`; the hole span excludes it.
            segments.push(InterpSegment::Hole(Span::new(hole_start, hole_end - 1)));
            cursor = hole_end;
            continue;
        }

        literal.push(match escape {
            b'n' => '\n',
            b't' => '\t',
            b'"' => '"',
            b'\\' => '\\',
            other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
        });
        cursor += 2;
    }

    if !literal.is_empty() {
        segments.push(InterpSegment::Chunk(literal));
    }
    Ok(segments)
}
/// Checks whether a doc-block fence line begins at `pos`.
///
/// A fence line is: optional spaces/tabs, three or more `-`, optional
/// spaces/tabs/carriage returns, then a newline or the end of input. `pos`
/// must be at the start of a line. On a match the offset just past the fence
/// line is returned: past its newline, or the input length if there is none.
fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
    if !at_line_start(source, pos) {
        return None;
    }
    let bytes = source.as_bytes();
    // Advances from `from` over every leading byte accepted by `keep`.
    let skip_while = |from: usize, keep: fn(u8) -> bool| -> usize {
        from + bytes[from..].iter().take_while(|&&b| keep(b)).count()
    };

    let fence_start = skip_while(pos, |b| matches!(b, b' ' | b'\t'));
    if !bytes[fence_start..].starts_with(b"---") {
        return None;
    }
    let after_dashes = skip_while(fence_start + 3, |b| b == b'-');
    let line_end = skip_while(after_dashes, |b| matches!(b, b' ' | b'\t' | b'\r'));

    match bytes.get(line_end) {
        None => Some(line_end),
        Some(b'\n') => Some(line_end + 1),
        // Anything else on the line means this is not a fence.
        Some(_) => None,
    }
}
/// Finds the fence line that closes a doc block.
///
/// `pos` must be the start of the first line after the opening fence. Lines
/// are examined one at a time; for the first one that is a fence, returns
/// `(start of that line, offset just past it)`. Returns `None` if the input
/// ends without one.
fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
    let bytes = source.as_bytes();
    while pos < bytes.len() {
        if let Some(end) = doc_block_open_at(source, pos) {
            return Some((pos, end));
        }
        // Not a fence: move to the start of the next line, or to the end of
        // input when this was the last line.
        pos = match bytes[pos..].iter().position(|&b| b == b'\n') {
            Some(offset) => pos + offset + 1,
            None => bytes.len(),
        };
    }
    None
}
/// Reports whether `pos` is the first byte of a line: either the very start of
/// `source` or the position right after a `\n`.
fn at_line_start(source: &str, pos: usize) -> bool {
    pos == 0 || source.as_bytes()[pos - 1] == b'\n'
}
/// Extracts the text between a doc block's opening and closing fences.
///
/// `span` is expected to cover the whole block the way `tokenize` emits it:
/// from the start of the opening fence line through the end of the closing
/// fence line. The opening line and the closing fence are removed; for the
/// closing fence that means its indentation, dashes, trailing whitespace and
/// newline. The indentation shared by every non-blank line is then stripped.
/// When any indentation is stripped, whitespace-only lines come back empty.
///
/// Returns an empty string if the spanned text contains no newline at all.
pub fn doc_block_content(source: &str, span: Span) -> String {
    let slice = &source[span.range()];
    let after_open = match slice.find('\n') {
        Some(i) => &slice[i + 1..],
        None => return String::new(),
    };

    // Peel the closing fence off the end, right to left.
    let bytes = after_open.as_bytes();
    let mut i = bytes.len();
    if i > 0 && bytes[i - 1] == b'\n' {
        i -= 1;
    }
    while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
        i -= 1;
    }
    while i > 0 && bytes[i - 1] == b'-' {
        i -= 1;
    }
    // A fence may be indented. Drop that indentation too, otherwise it (and
    // the newline before it) would leak into the content of indented blocks.
    while i > 0 && matches!(bytes[i - 1], b' ' | b'\t') {
        i -= 1;
    }
    // The newline that separates the last content line from the fence.
    if i > 0 && bytes[i - 1] == b'\n' {
        i -= 1;
    }
    let body = &after_open[..i];

    // Smallest indentation among non-blank lines; blank lines do not count.
    let common: Option<usize> = body
        .lines()
        .filter(|l| !l.trim().is_empty())
        .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
        .min();
    let strip = common.unwrap_or(0);
    if strip == 0 {
        return body.to_string();
    }

    let mut out = String::with_capacity(body.len());
    let mut first = true;
    for line in body.lines() {
        if !first {
            out.push('\n');
        }
        first = false;
        // Whitespace-only lines become empty lines.
        if line.trim().is_empty() {
            continue;
        }
        let leading: usize = line
            .bytes()
            .take_while(|&b| b == b' ' || b == b'\t')
            .count();
        let drop = strip.min(leading);
        out.push_str(&line[drop..]);
    }
    out
}
/// Returns the text of a line comment without its leading `--` marker.
///
/// If the spanned text does not start with `--`, it is returned unchanged.
pub fn comment_body(source: &str, span: Span) -> &str {
    let text = &source[span.range()];
    match text.strip_prefix("--") {
        Some(rest) => rest,
        None => text,
    }
}
/// Reports whether a newline is reached when scanning `source[from..to]`
/// before any byte other than a space, tab or carriage return.
///
/// Returns `false` for an empty or inverted range, and when the range holds
/// only spaces, tabs and carriage returns.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    // First byte in the range that is not horizontal whitespace, if any.
    let first_significant = (from..to)
        .map(|i| bytes[i])
        .find(|&b| !matches!(b, b' ' | b'\t' | b'\r'));
    first_significant == Some(b'\n')
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `source`, panicking on failure, and keeps only the token kinds.
    fn kinds(source: &str) -> Vec<TokenKind> {
        let tokens = tokenize(source).unwrap();
        tokens.iter().map(|token| token.kind).collect()
    }

    /// Lexes `source`, which must be rejected, and returns the error.
    fn lex_error(source: &str) -> CompileError {
        tokenize(source).unwrap_err()
    }

    #[test]
    fn keywords_and_idents() {
        use TokenKind::*;
        let lexed = kinds("commons type fn where and true false Int String Bool foo bar");
        assert_eq!(
            lexed,
            [Commons, Type, Fn, Where, And, True, False, Int, String, Bool, Ident, Ident],
        );
    }

    #[test]
    fn integer_and_string_literals() {
        use TokenKind::*;
        let lexed = kinds(r#"0 42 "hello" "with\nescape""#);
        assert_eq!(lexed, [IntLit, IntLit, StrLit, StrLit]);
    }

    #[test]
    fn operators() {
        use TokenKind::*;
        let lexed = kinds("-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : . @");
        let expected = [
            Arrow, EqEq, BangEq, LtEq, GtEq, AmpAmp, PipePipe, Plus, Minus, Star, Slash, Bang,
            Eq, Lt, Gt, LParen, RParen, LBrace, RBrace, LBracket, RBracket, Comma, Colon, Dot,
            At,
        ];
        assert_eq!(lexed, expected);
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        use TokenKind::*;
        let lexed = kinds("-- a comment\ntype X = Int -- trailing\n");
        assert_eq!(lexed, [Comment, Type, Ident, Eq, Int, Comment]);
    }

    #[test]
    fn comment_body_extracts_text_after_marker() {
        let src = "-- hello world\n";
        let toks = tokenize(src).unwrap();
        assert_eq!(toks.len(), 1);
        let only = toks[0];
        assert_eq!(only.kind, TokenKind::Comment);
        assert_eq!(comment_body(src, only.span), " hello world");
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Two tokens, both comments: the newline between them belongs to neither.
        assert_eq!(
            kinds("-- one\n-- two\n"),
            [TokenKind::Comment, TokenKind::Comment],
        );
    }

    #[test]
    fn unterminated_string_is_error() {
        let err = lex_error("\"oops\n");
        assert_eq!(err.category, "bynk.lex.unterminated_string");
    }

    #[test]
    fn integer_overflow_is_error() {
        let err = lex_error("99999999999999999999");
        assert_eq!(err.category, "bynk.lex.integer_overflow");
    }

    #[test]
    fn unexpected_character_is_error() {
        let err = lex_error("type X = Int $");
        assert_eq!(err.category, "bynk.lex.unexpected_character");
    }

    #[test]
    fn v0_1_keywords() {
        use TokenKind::*;
        let lexed = kinds("let if else Ok Err Result ValidationError");
        assert_eq!(lexed, [Let, If, Else, Ok, Err, Result, ValidationError]);
    }

    #[test]
    fn question_token() {
        use TokenKind::*;
        assert_eq!(kinds("x?"), [Ident, Question]);
    }

    #[test]
    fn v0_2_keywords() {
        use TokenKind::*;
        let lexed = kinds("enum match Option record self Some None is");
        assert_eq!(lexed, [Enum, Match, Option, Record, Self_, Some, None, Is]);
    }

    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        use TokenKind::*;
        assert_eq!(kinds("| || |"), [Pipe, PipePipe, Pipe]);
    }

    #[test]
    fn v0_7_keywords() {
        use TokenKind::*;
        let lexed = kinds("assert expect mocks test");
        assert_eq!(lexed, [Assert, Expect, Mocks, Test]);
    }

    #[test]
    fn fat_arrow_and_underscore() {
        use TokenKind::*;
        assert_eq!(kinds("_ =>"), [Underscore, FatArrow]);
    }

    #[test]
    fn interp_string_is_one_token() {
        use TokenKind::*;
        assert_eq!(kinds(r#""Hello, \(name)!""#), [InterpStr]);
        assert_eq!(kinds(r#""Hello, world""#), [StrLit]);
    }

    #[test]
    fn interp_balances_nested_parens_and_strings() {
        use TokenKind::*;
        // Nested call, a `)` hidden inside a nested string, and a nested
        // interpolated string: each must still lex as a single token.
        for src in [
            r#""= \(f(x))""#,
            r#""= \(label(")"))""#,
            r#""out \("in \(x)")""#,
        ] {
            assert_eq!(kinds(src), [InterpStr]);
        }
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        use TokenKind::*;
        assert_eq!(kinds(r#""a \\(b) c""#), [StrLit]);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        let err = lex_error("\"value \\(x + 1\n\"");
        assert_eq!(err.category, "bynk.lex.unterminated_interpolation");
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        let err = lex_error("\"value \\(x) more\n");
        assert_eq!(err.category, "bynk.lex.unterminated_string");
    }

    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        let err = lex_error(r#""a \q \(x)""#);
        assert_eq!(err.category, "bynk.lex.bad_escape");
    }
}