/// The kind of a lexed token, together with its payload where it has one.
///
/// Text payloads are stored as shared character slices, so cloning a token
/// does not copy the text.
#[derive(Clone)]
pub enum TokenType {
    /// An identifier, as produced by `lex_symbol`.
    Symbol(std::sync::Arc<[char]>),
    /// A name introduced by `#` (the `#` itself is not stored).
    Atom(std::sync::Arc<[char]>),
    /// The contents of a double-quoted string, without the quotes.
    String(std::sync::Arc<[char]>),
    /// A character written as `#` followed by its decimal code point.
    Char(char),
    /// A numeric literal parsed as `f64`.
    Number(f64),
    /// The `,` token. (The spelling is part of the public API.)
    Seperator,
    /// The `;` token. (The spelling is part of the public API.)
    SentenceSeperator,
    /// The `[` token.
    FuncListOpen,
    /// The `]` token.
    FuncListClose,
    /// The `{` token.
    ListOpen,
    /// The `}` token.
    ListClose,
}
impl std::fmt::Debug for TokenType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
TokenType::Symbol(sym) => write!(f, "Id({})", sym.iter().collect::<String>()),
TokenType::Atom(a) => write!(f, "Atom({})", a.iter().collect::<String>()),
TokenType::String(s) => write!(f, "Str({})", s.iter().collect::<String>()),
TokenType::Char(c) => write!(f, "Ch({c})"),
TokenType::Number(n) => write!(f, "Num({n})"),
TokenType::Seperator => write!(f, "S"),
TokenType::SentenceSeperator => write!(f, "SS"),
TokenType::FuncListOpen => write!(f, "FnO"),
TokenType::FuncListClose => write!(f, "FnC"),
TokenType::ListOpen => write!(f, "LstO"),
TokenType::ListClose => write!(f, "LstC"),
}
}
}
/// A lexed token and the span of source text it was read from.
#[derive(Clone)]
pub struct Token {
    /// What kind of token this is, including any payload.
    pub value: TokenType,
    /// `((start_line, start_col), (end_line, end_col))`.
    ///
    /// The start is the position of the token's first character and the end
    /// is the position of the last character consumed for it.
    pub location: ((usize, usize), (usize, usize)),
}
// Renders a token as `Token<kind, [(line, col), (line, col)]>`.
impl std::fmt::Debug for Token {
    /// Writes the token kind (via its own `Debug`) followed by the span.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let ((start_line, start_col), (end_line, end_col)) = self.location;
        write!(
            f,
            "Token<{:?}, [({}, {}), ({}, {})]>",
            self.value, start_line, start_col, end_line, end_col,
        )
    }
}
/// Cursor over the source text that also tracks the current line and column.
struct Lexer<'src> {
    /// Remaining characters, with one character of lookahead.
    chars: std::iter::Peekable<std::str::Chars<'src>>,
    /// Line of the most recently consumed character.
    line: usize,
    /// Column of the most recently consumed character (0 before any
    /// character on the line has been consumed).
    col: usize,
}
impl<'a> Lexer<'a> {
fn new(input: &'a str) -> Self {
Self {
chars: input.chars().peekable(),
line: 1,
col: 0,
}
}
fn peek(&mut self) -> Option<char> { self.chars.peek().copied() }
fn next_char(&mut self) -> Option<char> {
let ch = self.chars.next()?;
if ch == '\n' {
self.line += 1;
self.col = 0;
} else {
self.col += 1;
}
Some(ch)
}
fn current_pos(&self) -> (usize, usize) { (self.line, self.col + 1) }
fn end_pos(&self) -> (usize, usize) { (self.line, self.col) }
}
/// Splits `source` into a list of tokens.
///
/// Whitespace is skipped. `(* ... *)` comments are skipped and may nest.
///
/// # Errors
///
/// Returns a message when a character cannot start any token, when one of
/// the per-token lexers fails, or when a comment is still open at the end of
/// the input.
pub fn source_to_token(source: &str) -> Result<Vec<Token>, std::sync::Arc<str>> {
    let mut lexer = Lexer::new(source);
    let mut tokens = Vec::new();
    // Number of `(*` openers that have not been matched by `*)` yet.
    let mut comment_depth = 0;

    while let Some(ch) = lexer.peek() {
        // Comment delimiters are two characters wide; both are consumed at once.
        let opens = ch == '(' && lexer.peek_second() == Some('*');
        let closes =
            !opens && ch == '*' && comment_depth > 0 && lexer.peek_second() == Some(')');
        if opens || closes {
            lexer.next_char();
            lexer.next_char();
            if opens {
                comment_depth += 1;
            } else {
                comment_depth -= 1;
            }
            continue;
        }

        // Everything else inside a comment is discarded.
        if comment_depth > 0 {
            lexer.next_char();
            continue;
        }

        let token = match ch {
            ' ' | '\t' | '\r' | '\n' => {
                lexer.next_char();
                continue;
            }
            '"' => lex_string(&mut lexer)?,
            '#' => lex_atom_or_char(&mut lexer)?,
            ',' => make_token(&mut lexer, TokenType::Seperator),
            ';' => make_token(&mut lexer, TokenType::SentenceSeperator),
            '[' => make_token(&mut lexer, TokenType::FuncListOpen),
            ']' => make_token(&mut lexer, TokenType::FuncListClose),
            '{' => make_token(&mut lexer, TokenType::ListOpen),
            '}' => make_token(&mut lexer, TokenType::ListClose),
            c if c.is_ascii_digit() || matches!(c, '+' | '-') => lex_number(&mut lexer)?,
            c if is_symbol_start(c) => lex_symbol(&mut lexer)?,
            _ => {
                // Nothing has been consumed for this character yet, so the
                // current position is where the offending character sits.
                let (line, col) = lexer.current_pos();
                return Err(std::sync::Arc::from(format!(
                    "Error[ksl::token::source_to_token]: Invalid token `{}` at `({}, {})`.",
                    ch, line, col
                )));
            }
        };
        tokens.push(token);
    }

    if comment_depth != 0 {
        return Err(std::sync::Arc::from(
            "Error[ksl::token::source_to_token]: Unclosed comment.",
        ));
    }
    Ok(tokens)
}
impl<'src> Lexer<'src> {
    /// Returns the character after the next one without consuming anything.
    ///
    /// Works on a clone of the underlying iterator, so the cursor itself is
    /// left untouched.
    fn peek_second(&self) -> Option<char> {
        self.chars.clone().nth(1)
    }
}
/// Consumes exactly one character and wraps `val` in a token spanning it.
fn make_token(lexer: &mut Lexer, val: TokenType) -> Token {
    let begin = lexer.current_pos();
    lexer.next_char();
    let end = lexer.end_pos();
    Token {
        value: val,
        location: (begin, end),
    }
}
/// Whether `c` may begin a symbol: `_`, or any character that is not ASCII
/// punctuation, whitespace, or an ASCII digit.
fn is_symbol_start(c: char) -> bool {
    c == '_' || !(c.is_ascii_punctuation() || c.is_whitespace() || c.is_ascii_digit())
}
/// Whether `c` may continue a symbol: `_`, `'`, or any character that is
/// neither ASCII punctuation nor whitespace (digits are allowed here).
fn is_symbol_cont(c: char) -> bool {
    matches!(c, '_' | '\'') || !(c.is_ascii_punctuation() || c.is_whitespace())
}
/// Lexes a double-quoted string starting at the cursor.
///
/// The opening quote is consumed first; every character up to the next `"`
/// becomes part of the string as-is.
///
/// # Errors
///
/// Returns a message carrying the position of the opening quote when the
/// input ends before a closing quote is found.
fn lex_string(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
    let start = lexer.current_pos();
    lexer.next_char(); // the opening quote
    let mut body: Vec<char> = Vec::new();
    loop {
        match lexer.next_char() {
            Some('"') => {
                return Ok(Token {
                    value: TokenType::String(std::sync::Arc::from(body)),
                    location: (start, lexer.end_pos()),
                });
            }
            Some(other) => body.push(other),
            None => {
                return Err(std::sync::Arc::from(format!(
                    "Error[ksl::token::lex_string]: Unclosed string at `({}, {})`.",
                    start.0, start.1
                )));
            }
        }
    }
}
/// Lexes a numeric literal starting at the cursor.
///
/// Greedily consumes ASCII digits and the characters `+`, `-`, `.` and `e`,
/// then parses the collected text as `f64`.
///
/// # Errors
///
/// Returns a message with the collected text and its start position when
/// that text is not a valid `f64`.
fn lex_number(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
    let start = lexer.current_pos();
    let mut text = String::new();
    while let Some(c) = lexer.peek() {
        let numeric = c.is_ascii_digit() || matches!(c, '+' | '-' | '.' | 'e');
        if !numeric {
            break;
        }
        lexer.next_char();
        text.push(c);
    }
    match text.parse::<f64>() {
        Ok(number) => Ok(Token {
            value: TokenType::Number(number),
            location: (start, lexer.end_pos()),
        }),
        Err(_) => Err(std::sync::Arc::from(format!(
            concat!(
                "Error[ksl::token::lex_number]: ",
                "Invalid number string `{}` at `({}, {})`."
            ),
            text, start.0, start.1
        ))),
    }
}
/// Lexes a `#`-prefixed token: a character code or an atom.
///
/// After the `#` is consumed:
/// - a run of ASCII digits is read as a decimal code point and yields
///   `TokenType::Char`;
/// - otherwise, a character that is neither ASCII punctuation nor whitespace
///   starts an atom, which extends over every `is_symbol_cont` character.
///
/// # Errors
///
/// Returns a message when the digits do not fit in `u32`, when the code is
/// not a valid `char`, or when nothing usable follows the `#`.
fn lex_atom_or_char(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
    let start = lexer.current_pos();
    lexer.next_char(); // the leading `#`
    let first = lexer.peek();

    // `#<digits>`: a character given by its decimal code point.
    if first.map_or(false, |c| c.is_ascii_digit()) {
        let mut digits = String::new();
        while let Some(d) = lexer.peek() {
            if !d.is_ascii_digit() {
                break;
            }
            lexer.next_char();
            digits.push(d);
        }
        let code = match digits.parse::<u32>() {
            Ok(code) => code,
            Err(_) => {
                return Err(std::sync::Arc::from(format!(
                    "Error[ksl::token::lex_atom_or_char]: Invalid number string `{}` at `({}, {})`.",
                    digits, start.0, start.1
                )));
            }
        };
        let ch = match char::from_u32(code) {
            Some(ch) => ch,
            None => {
                return Err(std::sync::Arc::from(format!(
                    "Error[ksl::token::lex_atom_or_char]: Invalid unicode `{}` at `({}, {})`.",
                    code, start.0, start.1
                )));
            }
        };
        return Ok(Token {
            value: TokenType::Char(ch),
            location: (start, lexer.end_pos()),
        });
    }

    // `#<name>`: an atom.
    if first.map_or(false, |c| !c.is_ascii_punctuation() && !c.is_whitespace()) {
        let mut name = Vec::new();
        while let Some(c) = lexer.peek() {
            if !is_symbol_cont(c) {
                break;
            }
            lexer.next_char();
            name.push(c);
        }
        return Ok(Token {
            value: TokenType::Atom(std::sync::Arc::from(name)),
            location: (start, lexer.end_pos()),
        });
    }

    // End of input, punctuation, or whitespace directly after the `#`.
    Err(std::sync::Arc::from(format!(
        "Error[ksl::token::lex_atom_or_char]: Invalid atom at `({}, {})`.",
        start.0, start.1
    )))
}
/// Lexes a symbol starting at the cursor.
///
/// Consumes characters for as long as `is_symbol_cont` accepts them. This
/// function itself never returns `Err`.
fn lex_symbol(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
    let start = lexer.current_pos();
    let mut name = Vec::new();
    while let Some(c) = lexer.peek().filter(|&c| is_symbol_cont(c)) {
        lexer.next_char();
        name.push(c);
    }
    let end = lexer.end_pos();
    Ok(Token {
        value: TokenType::Symbol(std::sync::Arc::from(name)),
        location: (start, end),
    })
}