use std::{
str::FromStr,
f64,
rc::Rc,
};
use crate::common::{
source::Source,
span::{ Span, Spanned },
data::Data,
};
use crate::compiler::{
token::Token,
syntax::Syntax,
};
type Bite = (Token, usize);
pub fn lex(source: Rc<Source>) -> Result<Vec<Spanned<Token>>, Syntax> {
let mut lexer = Lexer::new(&source);
return lexer.all();
}
pub struct Lexer {
source: Rc<Source>,
offset: usize,
}
impl Lexer {
pub fn new(source: &Rc<Source>) -> Lexer {
Lexer { source: Rc::clone(source), offset: 0 }
}
pub fn all(&mut self) -> Result<Vec<Spanned<Token>>, Syntax> {
let mut tokens = vec![];
while self.remaining().len() != 0 {
self.strip();
self.offset += Lexer::comment(&self.remaining());
self.offset += Lexer::multi_comment(&self.remaining());
self.strip();
let (kind, consumed) = match self.step() {
Ok(k) => k,
Err(e) => return Err(
Syntax::error(&e, &Span::point(&self.source, self.offset))
),
};
tokens.push(Spanned::new(
kind,
Span::new(&self.source, self.offset, consumed),
));
self.offset += consumed;
}
tokens.push(Spanned::new(Token::End, Span::empty()));
return Ok(tokens);
}
pub fn step(&self) -> Result<Bite, String> {
let source = self.remaining();
let rules: Vec<Box<dyn Fn(&str) -> Result<Bite, String>>> = vec![
Box::new(Lexer::unit),
Box::new(Lexer::open_bracket),
Box::new(Lexer::close_bracket),
Box::new(Lexer::open_paren),
Box::new(Lexer::close_paren),
Box::new(Lexer::syntax),
Box::new(Lexer::assign),
Box::new(Lexer::lambda),
Box::new(Lexer::compose),
Box::new(Lexer::pair),
Box::new(Lexer::add),
Box::new(Lexer::sub),
Box::new(Lexer::mul),
Box::new(Lexer::div),
Box::new(Lexer::equal),
Box::new(Lexer::remainder),
Box::new(Lexer::magic),
Box::new(Lexer::print),
Box::new(Lexer::sep),
Box::new(Lexer::boolean),
Box::new(Lexer::real),
Box::new(Lexer::integer),
Box::new(Lexer::string),
Box::new(Lexer::keyword),
Box::new(Lexer::label),
Box::new(Lexer::symbol),
];
let mut best = Err("Unexpected token".to_string());
for rule in &rules {
if let Ok((k, c)) = rule(source) {
match best {
Err(_) => best = Ok((k, c)),
Ok((_, o)) if c > o => best = Ok((k, c)),
Ok(_) => (),
}
}
}
return best;
}
pub fn remaining(&self) -> &str {
return &self.source.contents[self.offset..]
}
pub fn strip(&mut self) {
let mut len = 0;
for char in self.remaining().chars() {
if !char.is_whitespace() || char == '\n' {
break;
}
len += char.len_utf8();
}
self.offset += len;
}
pub fn expect(source: &str, literal: &str) -> Result<usize, String> {
if literal.len() > source.len() {
return Err("Unexpected EOF while lexing".to_string());
}
match &source.as_bytes()[..literal.len()] {
s if s == literal.as_bytes() => Ok(literal.len()),
_ => Err(format!("Expected '{}'", source)),
}
}
pub fn eat_digits(source: &str) -> Result<usize, String> {
let mut len = 0;
for char in source.chars() {
match char {
n if n.is_digit(10) => len += n.len_utf8(),
_ => break,
}
}
return if len == 0 { Err("Expected digits".to_string()) } else { Ok(len) };
}
pub fn literal(source: &str, literal: &str, kind: Token) -> Result<Bite, String> {
Ok((kind, Lexer::expect(source, literal)?))
}
pub fn open_bracket(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "{", Token::OpenBracket)
}
pub fn close_bracket(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "}", Token::CloseBracket)
}
pub fn unit(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "()", Token::Unit)
}
pub fn open_paren(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "(", Token::OpenParen)
}
pub fn close_paren(source: &str) -> Result<Bite, String> {
Lexer::literal(source, ")", Token::CloseParen)
}
pub fn syntax(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "syntax", Token::Syntax)
}
pub fn assign(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "=", Token::Assign)
}
pub fn lambda(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "->", Token::Lambda)
}
pub fn compose(source: &str) -> Result<Bite, String> {
Lexer::literal(source, ".", Token::Compose)
}
pub fn pair(source: &str) -> Result<Bite, String> {
Lexer::literal(source, ",", Token::Pair)
}
pub fn add(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "+", Token::Add)
}
pub fn sub(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "-", Token::Sub)
}
pub fn mul(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "*", Token::Mul)
}
pub fn div(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "/", Token::Div)
}
pub fn equal(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "==", Token::Equal)
}
pub fn remainder(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "%", Token::Rem)
}
pub fn print(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "print", Token::Print)
}
pub fn magic(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "magic", Token::Magic)
}
pub fn comment(source: &str) -> usize {
let mut len = match Lexer::expect(source, "--") {
Ok(n) => n,
Err(_) => { return 0; },
};
for char in source[len..].chars() {
if char == '\n' { break; }
len += char.len_utf8();
}
return len;
}
pub fn multi_comment(source: &str) -> usize {
let mut len: usize = match Lexer::expect(source, "-{") {
Ok(n) => n,
Err(_) => { return 0; },
};
for char in source[len..].chars() {
if let Ok(_) = Lexer::expect(&source[len..], "-{") {
len += Lexer::multi_comment(&source[len..]);
} else if let Ok(end) = Lexer::expect(&source[len..], "}-") {
len += end; break;
} else {
len += char.len_utf8();
}
}
return len;
}
pub fn identifier(source: &str) -> Result<Bite, String> {
let mut len = 0;
for char in source.chars() {
match char {
a if a.is_alphanumeric()
|| "_".contains(a)
=> { len += a.len_utf8() },
_ => { break; },
}
}
if len == 0 {
return Err("Expected an alphanumeric character".to_string());
}
let first = source.chars().next().unwrap();
match first {
n if n.is_numeric() => Err(
"Can not start with a numeric character".to_string()
),
s if s.is_uppercase() => Ok((Token::Label, len)),
_ => Ok((Token::Symbol, len)),
}
}
pub fn symbol(source: &str) -> Result<Bite, String> {
if let symbol @ (Token::Symbol, _) = Lexer::identifier(source)? {
Ok(symbol)
} else {
Err("Expected a symbol".to_string())
}
}
pub fn label(source: &str) -> Result<Bite, String> {
if let label @ (Token::Label, _) = Lexer::identifier(source)? {
Ok(label)
} else {
Err("Expected a Label".to_string())
}
}
pub fn keyword(source: &str) -> Result<Bite, String> {
let mut len = 0;
len += Lexer::expect(&source, "'")?;
if let (Token::Symbol, l) = Lexer::identifier(&source[len..])? {
let keyword = source[len..len+l].to_string();
Ok((Token::Keyword(keyword), len + l))
} else {
Err("Expected a pseudokeyword".to_string())
}
}
pub fn real(source: &str) -> Result<Bite, String> {
let mut len = 0;
len += Lexer::eat_digits(source)?;
len += Lexer::expect(&source[len..], ".")?;
len += Lexer::eat_digits(&source[len..])?;
let number = match f64::from_str(&source[..len]) {
Ok(n) => n,
Err(_) => panic!("Could not convert source to supposed real")
};
return Ok((Token::Number(Data::Real(number)), len));
}
pub fn integer(source: &str) -> Result<Bite, String> {
let mut len = 0;
len += Lexer::eat_digits(source)?;
let number = match i64::from_str(&source[..len]) {
Ok(n) => n,
Err(_) => panic!("Could not convert source to supposed integer"),
};
return Ok((Token::Number(Data::Integer(number)), len));
}
pub fn string(source: &str) -> Result<Bite, String> {
let mut len = 0;
let mut escape = false;
let mut string = "".to_string();
len += Lexer::expect(source, "\"")?;
for c in source[len..].chars() {
len += c.len_utf8();
if escape {
escape = false;
string.push(match c {
'"' => '"',
'\\' => '\\',
'n' => '\n',
't' => '\t',
'r' => '\r',
o => return Err(format!("Unknown escape code '\\{}'", o)),
})
} else {
match c {
'\\' => escape = true,
'\"' => return Ok((Token::String(Data::String(string)), len)),
c => string.push(c),
}
}
}
return Err("Unexpected EOF while parsing string literal".to_string());
}
pub fn boolean(source: &str) -> Result<Bite, String> {
for (lit, val) in [
("true", true),
("false", false),
].iter() {
if let x @ Ok(_) = Lexer::literal(
source, lit, Token::Boolean(Data::Boolean(*val))
) { return x; }
}
return Err("Expected a boolean".to_string());
}
pub fn sep(source: &str) -> Result<Bite, String> {
let mut chars = source.chars();
let c = chars.next()
.ok_or("Unexpected EOF while parsing")?;
if c != '\n' && c != ';' {
return Err("Expected a separator such as a newline or semicolon".to_string())
}
let mut len = c.len_utf8();
for c in chars {
if c != ';' && !c.is_whitespace() {
break;
}
len += c.len_utf8();
}
return Ok((Token::Sep, len));
}
}
#[cfg(test)]
mod test {
use super::*;
use crate::common::data::Data;
#[test]
fn empty() {
let result = lex(Source::source(""));
let target: Result<Vec<Spanned<Token>>, Syntax> =
Ok(vec![Spanned::new(Token::End, Span::empty())]);
assert_eq!(result, target);
}
#[test]
fn assignment() {
let source = Source::source("heck = true");
let result = vec![
Spanned::new(Token::Symbol, Span::new(&source, 0, 4)),
Spanned::new(Token::Assign, Span::new(&source, 5, 1)),
Spanned::new(Token::Boolean(Data::Boolean(true)), Span::new(&source, 7, 4)),
Spanned::new(Token::End, Span::empty()),
];
assert_eq!(lex(source), Ok(result));
}
#[test]
fn whitespace() {
let source = Source::source(" true ; ");
let result = vec![
Spanned::new(Token::Boolean(Data::Boolean(true)), Span::new(&source, 2, 4)),
Spanned::new(Token::Sep, Span::new(&source, 8, 3)),
Spanned::new(Token::End, Span::empty()),
];
assert_eq!(lex(source), Ok(result));
}
#[test]
fn block() {
let source = Source::source("{\n\thello = true\n\thello\n}");
let result = vec![
Spanned::new(Token::OpenBracket, Span::new(&source, 0, 1)),
Spanned::new(Token::Sep, Span::new(&source, 1, 2)),
Spanned::new(Token::Symbol, Span::new(&source, 3, 5)),
Spanned::new(Token::Assign, Span::new(&source, 9, 1)),
Spanned::new(Token::Boolean(Data::Boolean(true)), Span::new(&source, 11, 4)),
Spanned::new(Token::Sep, Span::new(&source, 15, 2)),
Spanned::new(Token::Symbol, Span::new(&source, 17, 5)),
Spanned::new(Token::Sep, Span::new(&source, 22, 1)),
Spanned::new(Token::CloseBracket, Span::new(&source, 23, 1)),
Spanned::new(Token::End, Span::empty()),
];
assert_eq!(lex(source), Ok(result));
}
#[test]
fn function() {
let source = Source::source("identity = x -> x\nidentity (identity \"heck\")");
let result = vec![
Spanned::new(Token::Symbol, Span::new(&source, 0, 8)),
Spanned::new(Token::Assign, Span::new(&source, 9, 1)),
Spanned::new(Token::Symbol, Span::new(&source, 11, 1)),
Spanned::new(Token::Lambda, Span::new(&source, 13, 2)),
Spanned::new(Token::Symbol, Span::new(&source, 16, 1)),
Spanned::new(Token::Sep, Span::new(&source, 17, 1)),
Spanned::new(Token::Symbol, Span::new(&source, 18, 8)),
Spanned::new(Token::OpenParen, Span::new(&source, 27, 1)),
Spanned::new(Token::Symbol, Span::new(&source, 28, 8)),
Spanned::new(Token::String(Data::String("heck".to_string())), Span::new(&source, 37, 6)),
Spanned::new(Token::CloseParen, Span::new(&source, 43, 1)),
Spanned::new(Token::End, Span::empty()),
];
assert_eq!(lex(source), Ok(result));
}
fn test_literal(literal: &str, token: Token, length: usize) -> bool {
let result = Lexer::new(&Source::source(literal)).step();
match result {
Ok(v) => v == (token, length),
Err(_) => false
}
}
#[test]
fn boolean() {
if !test_literal("true", Token::Boolean(Data::Boolean(true)), 4) { panic!() }
if !test_literal("false", Token::Boolean(Data::Boolean(false)), 5) { panic!() }
}
#[test]
fn assign() {
if !test_literal("=", Token::Assign, 1) { panic!() }
}
#[test]
fn symbol() {
if !test_literal("orchard", Token::Symbol, 7) { panic!() }
}
#[test]
fn sep() {
if !test_literal(
"\n heck",
Token::Sep,
3,
) { panic!() }
if !test_literal(
";\n ; heck",
Token::Sep,
5,
) { panic!() }
}
#[test]
fn real() {
if !test_literal(
"2.0",
Token::Number(Data::Real(2.0)),
3,
) { panic!() }
if !test_literal(
"210938.2221",
Token::Number(Data::Real(210938.2221)),
11,
) { panic!() }
}
#[test]
fn string() {
let source = "\"heck\"";
if !test_literal(
source,
Token::String(Data::String("heck".to_string())),
source.len(),
) { panic!() }
let escape = "\"I said, \\\"Hello, world!\\\" didn't I?\"";
if !test_literal(
escape,
Token::String(Data::String("I said, \"Hello, world!\" didn't I?".to_string())),
escape.len(),
) { panic!() }
let unicode = "\"Yo 👋! Ünícode µ works just fine 🚩! うん、気持ちいい!\"";
if !test_literal(
unicode,
Token::String(Data::String("Yo 👋! Ünícode µ works just fine 🚩! うん、気持ちいい!".to_string())),
unicode.len(),
) { panic!() }
}
#[test]
fn comma() {
let source = Source::source("heck\\ man");
let tokens = lex(source.clone());
assert_eq!(tokens, Err(Syntax::error("Unexpected token", &Span::new(&source, 4, 0))));
}
}