use std::str::Chars;
use thiserror::Error;
/// A cursor location within the source text.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct Position {
    // Byte offset from the start of the input (advanced by `len_utf8`).
    pub offset: usize,
    // 1-based line number; incremented on '\n'.
    pub line: usize,
    // 1-based column, counted in characters (not bytes); resets after '\n'.
    pub column: usize,
}
impl Position {
    /// The position of the first character: offset 0, line 1, column 1.
    #[must_use]
    pub const fn start() -> Self {
        Self {
            offset: 0,
            line: 1,
            column: 1,
        }
    }
}
/// A single lexical token produced by [`tokenize`].
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// `@name` — a symbol without a kind annotation.
    Symbol(String),
    /// `@name:Kind` — a symbol with an uppercase kind annotation.
    TypedSymbol {
        name: String,
        kind: String,
    },
    /// A bare lowercase identifier that is not a reserved word.
    Bareword(String),
    /// A date (`2024-01-15`) or `T`-separated datetime
    /// (`2026-04-17T10:00:00Z`), kept as raw text.
    Timestamp(String),
    /// A decimal integer literal.
    Integer(i64),
    /// A floating-point literal (the lexeme contains a `.`).
    Float(f64),
    /// A double-quoted string with escape sequences resolved.
    String(String),
    /// The reserved words `true` / `false`.
    Boolean(bool),
    /// The reserved word `nil`.
    Nil,
    /// `:name` — a keyword; the leading colon is stripped.
    Keyword(String),
    /// `(`
    LParen,
    /// `)`
    RParen,
}
/// A token paired with the source position of its first character.
#[derive(Clone, Debug, PartialEq)]
pub struct Spanned {
    pub token: Token,
    // Position where the token's first character begins.
    pub position: Position,
}
/// Errors produced while tokenizing.
#[derive(Debug, Error, PartialEq)]
pub enum LexError {
    /// A string literal was opened but input ended before the closing quote.
    #[error("unterminated string starting at {start:?}")]
    UnterminatedString {
        // Position of the opening quote.
        start: Position,
    },
    /// A backslash escape other than `\n`, `\r`, `\t`, `\\`, `\"`.
    #[error("invalid escape '\\{escape}' at {pos:?}")]
    InvalidEscape {
        escape: char,
        // Position of the backslash that introduced the escape.
        pos: Position,
    },
    /// A numeric-looking run that is neither a timestamp nor a parseable
    /// integer or float.
    #[error("invalid number {text:?} at {pos:?}")]
    InvalidNumber {
        text: String,
        pos: Position,
    },
    /// An identifier (bareword, symbol name, kind, or keyword) that
    /// violates the identifier rules.
    #[error("invalid identifier {text:?} at {pos:?}")]
    InvalidIdentifier {
        text: String,
        pos: Position,
    },
    /// A character that cannot begin any token.
    #[error("unexpected byte {byte:#04x} at {pos:?}")]
    UnexpectedByte {
        byte: u8,
        pos: Position,
    },
    // NOTE(review): not constructed anywhere in this file — input arrives
    // as &str, which is already valid UTF-8. Confirm no external callers
    // construct it before removing.
    #[error("invalid UTF-8 at {pos:?}")]
    InvalidUtf8 {
        pos: Position,
    },
}
/// Tokenizes `input` into a list of position-tagged tokens.
///
/// # Errors
///
/// Returns the first `LexError` encountered; tokens lexed before the
/// error are discarded.
pub fn tokenize(input: &str) -> Result<Vec<Spanned>, LexError> {
    let mut lexer = Lexer::new(input);
    let mut tokens = Vec::new();
    loop {
        match lexer.next_token()? {
            Some(spanned) => tokens.push(spanned),
            None => break,
        }
    }
    Ok(tokens)
}
/// Internal cursor state for tokenization.
struct Lexer<'a> {
    // Full source text; token lexemes are sliced out of it by byte offset.
    input: &'a str,
    // Iterator over the characters not yet consumed.
    chars: Chars<'a>,
    // Position of the next character to be consumed.
    pos: Position,
}
impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the beginning of `input`.
    fn new(input: &'a str) -> Self {
        Self {
            input,
            chars: input.chars(),
            pos: Position::start(),
        }
    }

    /// Returns the next character without consuming it.
    ///
    /// Cloning `Chars` is cheap — it copies only the iterator's pointers,
    /// not the underlying string.
    fn peek(&self) -> Option<char> {
        self.chars.clone().next()
    }

    /// Consumes one character, advancing byte offset, line, and column.
    ///
    /// Columns count characters (not bytes) and reset to 1 after `'\n'`.
    fn bump(&mut self) -> Option<char> {
        let c = self.chars.next()?;
        let len = c.len_utf8();
        self.pos.offset += len;
        if c == '\n' {
            self.pos.line += 1;
            self.pos.column = 1;
        } else {
            self.pos.column += 1;
        }
        Some(c)
    }

    /// Skips whitespace and `;` line comments.
    fn skip_whitespace_and_comments(&mut self) {
        while let Some(c) = self.peek() {
            if c.is_whitespace() {
                self.bump();
            } else if c == ';' {
                // Consume up to (not including) the newline; the outer
                // loop then skips the newline itself as whitespace.
                while let Some(cc) = self.peek() {
                    if cc == '\n' {
                        break;
                    }
                    self.bump();
                }
            } else {
                break;
            }
        }
    }

    /// Produces the next token, or `Ok(None)` at end of input.
    ///
    /// The returned `Spanned` carries the position of the token's first
    /// character.
    ///
    /// # Errors
    ///
    /// Returns a `LexError` when the input is not a valid token sequence.
    fn next_token(&mut self) -> Result<Option<Spanned>, LexError> {
        self.skip_whitespace_and_comments();
        let start = self.pos;
        let Some(c) = self.peek() else {
            return Ok(None);
        };
        let token = match c {
            '(' => {
                self.bump();
                Token::LParen
            }
            ')' => {
                self.bump();
                Token::RParen
            }
            '"' => self.lex_string(start)?,
            '@' => self.lex_symbol_or_typed(start)?,
            ':' => self.lex_keyword(start)?,
            '-' | '0'..='9' => self.lex_number_or_timestamp(start)?,
            'a'..='z' | '_' => self.lex_bareword_or_reserved(start)?,
            _ => {
                // Report the actual first UTF-8 byte at the error offset.
                // The previous `c as u32 as u8` silently truncated
                // multi-byte scalars (e.g. '€' U+20AC was reported as
                // 0xAC rather than its leading byte 0xE2).
                return Err(LexError::UnexpectedByte {
                    byte: self.input.as_bytes()[start.offset],
                    pos: start,
                });
            }
        };
        Ok(Some(Spanned {
            token,
            position: start,
        }))
    }

    /// Lexes a double-quoted string, resolving `\n`, `\r`, `\t`, `\\`,
    /// and `\"` escapes.
    ///
    /// # Errors
    ///
    /// `UnterminatedString` if input ends before the closing quote;
    /// `InvalidEscape` for an unrecognized escape character.
    fn lex_string(&mut self, start: Position) -> Result<Token, LexError> {
        self.bump(); // opening quote
        let mut buf = String::new();
        loop {
            // Captured before bumping so an escape error points at the
            // backslash itself.
            let pos = self.pos;
            let Some(c) = self.bump() else {
                return Err(LexError::UnterminatedString { start });
            };
            match c {
                '"' => return Ok(Token::String(buf)),
                '\\' => {
                    let Some(esc) = self.bump() else {
                        return Err(LexError::UnterminatedString { start });
                    };
                    let resolved = match esc {
                        'n' => '\n',
                        'r' => '\r',
                        't' => '\t',
                        '\\' => '\\',
                        '"' => '"',
                        other => return Err(LexError::InvalidEscape { escape: other, pos }),
                    };
                    buf.push(resolved);
                }
                other => buf.push(other),
            }
        }
    }

    /// Lexes `@name` into `Symbol`, or `@name:Kind` into `TypedSymbol`.
    ///
    /// # Errors
    ///
    /// `InvalidIdentifier` if the name or kind part is empty or malformed.
    fn lex_symbol_or_typed(&mut self, start: Position) -> Result<Token, LexError> {
        self.bump(); // '@'
        let name_start = self.pos.offset;
        self.consume_identifier();
        let name_end = self.pos.offset;
        let name = self.input[name_start..name_end].to_string();
        if name.is_empty() || !is_valid_identifier_start(&name) {
            return Err(LexError::InvalidIdentifier {
                text: format!("@{name}"),
                pos: start,
            });
        }
        if self.peek() == Some(':') {
            self.bump(); // ':'
            let kind_start = self.pos.offset;
            self.consume_kind_annotation();
            let kind_end = self.pos.offset;
            let kind = self.input[kind_start..kind_end].to_string();
            if kind.is_empty() || !is_valid_kind_annotation(&kind) {
                return Err(LexError::InvalidIdentifier {
                    text: format!("@{name}:{kind}"),
                    pos: start,
                });
            }
            Ok(Token::TypedSymbol { name, kind })
        } else {
            Ok(Token::Symbol(name))
        }
    }

    /// Lexes `:name` into `Keyword`; the leading colon is not stored.
    ///
    /// # Errors
    ///
    /// `InvalidIdentifier` if the name is empty or malformed.
    fn lex_keyword(&mut self, start: Position) -> Result<Token, LexError> {
        self.bump(); // ':'
        let name_start = self.pos.offset;
        self.consume_identifier();
        let name_end = self.pos.offset;
        let name = self.input[name_start..name_end].to_string();
        if name.is_empty() || !is_valid_identifier_start(&name) {
            return Err(LexError::InvalidIdentifier {
                text: format!(":{name}"),
                pos: start,
            });
        }
        Ok(Token::Keyword(name))
    }

    /// Greedily consumes digits plus `- . : T Z`, then classifies the run
    /// as `Timestamp`, `Float` (contains `.`), or `Integer`.
    ///
    /// # Errors
    ///
    /// `InvalidNumber` if the run is neither a timestamp nor a parseable
    /// number (e.g. a lone `-`, or `1.2.3`).
    fn lex_number_or_timestamp(&mut self, start: Position) -> Result<Token, LexError> {
        let begin = self.pos.offset;
        while let Some(c) = self.peek() {
            if c.is_ascii_digit() || matches!(c, '-' | '.' | ':' | 'T' | 'Z') {
                self.bump();
            } else {
                break;
            }
        }
        let end = self.pos.offset;
        let text = &self.input[begin..end];
        if looks_like_timestamp(text) {
            return Ok(Token::Timestamp(text.to_string()));
        }
        if text.contains('.') {
            text.parse::<f64>()
                .map(Token::Float)
                .map_err(|_| LexError::InvalidNumber {
                    text: text.to_string(),
                    pos: start,
                })
        } else {
            text.parse::<i64>()
                .map(Token::Integer)
                .map_err(|_| LexError::InvalidNumber {
                    text: text.to_string(),
                    pos: start,
                })
        }
    }

    /// Lexes a lowercase bareword; `true`, `false`, and `nil` are reserved.
    ///
    /// # Errors
    ///
    /// `InvalidIdentifier` if the word is not a valid identifier.
    fn lex_bareword_or_reserved(&mut self, start: Position) -> Result<Token, LexError> {
        let begin = self.pos.offset;
        self.consume_identifier();
        let end = self.pos.offset;
        let text = &self.input[begin..end];
        let token = match text {
            "true" => Token::Boolean(true),
            "false" => Token::Boolean(false),
            "nil" => Token::Nil,
            _ => {
                if is_valid_identifier_start(text) {
                    Token::Bareword(text.to_string())
                } else {
                    return Err(LexError::InvalidIdentifier {
                        text: text.to_string(),
                        pos: start,
                    });
                }
            }
        };
        Ok(token)
    }

    /// Consumes a run of identifier characters: `[a-z0-9_]`.
    fn consume_identifier(&mut self) {
        while let Some(c) = self.peek() {
            if c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_' {
                self.bump();
            } else {
                break;
            }
        }
    }

    /// Consumes a run of kind-annotation characters: ASCII letters and digits.
    fn consume_kind_annotation(&mut self) {
        while let Some(c) = self.peek() {
            if c.is_ascii_alphabetic() || c.is_ascii_digit() {
                self.bump();
            } else {
                break;
            }
        }
    }
}
/// Validates a complete identifier: first byte `[a-z_]`, remaining bytes
/// `[a-z0-9_]`. (Despite the `_start` name, the whole string is checked.)
///
/// Byte-wise checks are equivalent to char-wise here because every
/// predicate is ASCII-only: any non-ASCII byte (or char) fails.
fn is_valid_identifier_start(s: &str) -> bool {
    let bytes = s.as_bytes();
    match bytes.first() {
        Some(&b) if b.is_ascii_lowercase() || b == b'_' => bytes[1..]
            .iter()
            .all(|&b| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'_'),
        _ => false,
    }
}
/// Validates a kind annotation: one ASCII uppercase letter followed by
/// ASCII letters and digits (e.g. `Agent`, `T2`).
///
/// Uses ASCII-only checks so validation agrees with
/// `Lexer::consume_kind_annotation`, which consumes only ASCII letters
/// and digits; the previous `char::is_alphanumeric` also accepted
/// Unicode alphanumerics that the lexer can never produce.
fn is_valid_kind_annotation(s: &str) -> bool {
    let mut chars = s.chars();
    match chars.next() {
        Some(c) if c.is_ascii_uppercase() => chars.all(|c| c.is_ascii_alphanumeric()),
        _ => false,
    }
}
/// Heuristically recognizes timestamps: a `DDDD-DD-DD` date prefix,
/// optionally followed by `T` and a tail containing at least one `:`.
///
/// Only the shape is checked — digit ranges (month 13, day 99) are not
/// validated here.
fn looks_like_timestamp(text: &str) -> bool {
    let b = text.as_bytes();
    let has_date_prefix = b.len() >= 10
        && b[..4].iter().all(u8::is_ascii_digit)
        && b[4] == b'-'
        && b[5..7].iter().all(u8::is_ascii_digit)
        && b[7] == b'-'
        && b[8..10].iter().all(u8::is_ascii_digit);
    if !has_date_prefix {
        return false;
    }
    // A bare date is a timestamp; a longer lexeme must continue with 'T'
    // and carry a ':' somewhere in the time part.
    match b.get(10) {
        None => true,
        Some(&b'T') => b[11..].contains(&b':'),
        Some(_) => false,
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests covering each token class, comment skipping, error
    //! cases, and line/column tracking.
    use super::*;

    // Convenience accessor for the first token of a lexed sequence.
    fn first(tokens: &[Spanned]) -> &Token {
        &tokens[0].token
    }

    #[test]
    fn empty_input_produces_no_tokens() {
        assert!(tokenize("").unwrap().is_empty());
        assert!(tokenize(" \t\n ").unwrap().is_empty());
    }

    #[test]
    fn parens_are_tokens() {
        let t = tokenize("( )").unwrap();
        assert_eq!(t.len(), 2);
        assert_eq!(first(&t), &Token::LParen);
        assert_eq!(&t[1].token, &Token::RParen);
    }

    #[test]
    fn symbol_with_and_without_kind() {
        let t = tokenize("@alice @alice:Agent").unwrap();
        assert_eq!(first(&t), &Token::Symbol("alice".into()));
        assert_eq!(
            &t[1].token,
            &Token::TypedSymbol {
                name: "alice".into(),
                kind: "Agent".into(),
            }
        );
    }

    #[test]
    fn bareword_and_reserved_words() {
        let t = tokenize("email true false nil sem").unwrap();
        assert_eq!(first(&t), &Token::Bareword("email".into()));
        assert_eq!(&t[1].token, &Token::Boolean(true));
        assert_eq!(&t[2].token, &Token::Boolean(false));
        assert_eq!(&t[3].token, &Token::Nil);
        assert_eq!(&t[4].token, &Token::Bareword("sem".into()));
    }

    #[test]
    fn numbers_distinguish_int_and_float() {
        let t = tokenize("42 -17 3.14 -0.5").unwrap();
        assert_eq!(first(&t), &Token::Integer(42));
        assert_eq!(&t[1].token, &Token::Integer(-17));
        // Floats are compared with a tolerance rather than `==`.
        match &t[2].token {
            Token::Float(f) => assert!((f - 3.14).abs() < 1e-9),
            other => panic!("expected Float, got {other:?}"),
        }
        match &t[3].token {
            Token::Float(f) => assert!((f + 0.5).abs() < 1e-9),
            other => panic!("expected Float, got {other:?}"),
        }
    }

    #[test]
    fn timestamps_are_distinct_from_numbers() {
        let t = tokenize("2024-01-15 2026-04-17T10:00:00Z").unwrap();
        match first(&t) {
            Token::Timestamp(s) => assert_eq!(s, "2024-01-15"),
            other => panic!("expected Timestamp, got {other:?}"),
        }
        match &t[1].token {
            Token::Timestamp(s) => assert_eq!(s, "2026-04-17T10:00:00Z"),
            other => panic!("expected Timestamp, got {other:?}"),
        }
    }

    #[test]
    fn strings_resolve_escapes() {
        let t = tokenize(r#" "hello\nworld" "a\"b" "#).unwrap();
        assert_eq!(first(&t), &Token::String("hello\nworld".into()));
        assert_eq!(&t[1].token, &Token::String("a\"b".into()));
    }

    #[test]
    fn keyword_stripped_of_colon() {
        let t = tokenize(":src :confidence_threshold").unwrap();
        assert_eq!(first(&t), &Token::Keyword("src".into()));
        assert_eq!(&t[1].token, &Token::Keyword("confidence_threshold".into()));
    }

    #[test]
    fn line_comments_skipped() {
        let t = tokenize("; a comment\n@alice").unwrap();
        assert_eq!(t.len(), 1);
        assert_eq!(first(&t), &Token::Symbol("alice".into()));
    }

    #[test]
    fn unterminated_string_errors() {
        let result = tokenize(r#" "no close "#);
        assert!(matches!(result, Err(LexError::UnterminatedString { .. })));
    }

    #[test]
    fn invalid_escape_errors() {
        let result = tokenize(r#" "\q" "#);
        assert!(matches!(
            result,
            Err(LexError::InvalidEscape { escape: 'q', .. })
        ));
    }

    #[test]
    fn unexpected_byte_errors() {
        let result = tokenize("$");
        assert!(matches!(result, Err(LexError::UnexpectedByte { .. })));
    }

    #[test]
    fn positions_track_line_and_column() {
        let t = tokenize("(\n@alice").unwrap();
        assert_eq!(t[0].position.line, 1);
        assert_eq!(t[0].position.column, 1);
        assert_eq!(t[1].position.line, 2);
        assert_eq!(t[1].position.column, 1);
    }
}