use std::{
str::FromStr,
f64,
rc::Rc,
};
use crate::common::{
source::Source,
span::{ Span, Spanned },
data::Data,
};
use crate::compiler::{
token::Token,
syntax::Syntax,
};
/// The result of a single successful lexing rule: the token that was
/// recognised, paired with how much of the input it consumed. The lexer
/// uses the second element as the token's span length and adds it to its
/// offset to advance past the token.
type Bite = (Token, usize);
/// Tokenizes an entire source in one call.
///
/// Builds a fresh [`Lexer`] over `source` and runs it to completion,
/// yielding either every spanned token (terminated by `Token::End`) or the
/// first syntax error that was hit.
pub fn lex(source: Rc<Source>) -> Result<Vec<Spanned<Token>>, Syntax> {
    Lexer::new(&source).all()
}
/// State of an in-progress lex: the source being read and how far into it
/// the lexer has advanced.
pub struct Lexer {
// Shared handle to the source whose `contents` are being tokenized.
source: Rc<Source>,
// Index into `source.contents` at which the not-yet-lexed input begins.
offset: usize,
}
impl Lexer {
pub fn new(source: &Rc<Source>) -> Lexer {
Lexer { source: Rc::clone(source), offset: 0 }
}
pub fn all(&mut self) -> Result<Vec<Spanned<Token>>, Syntax> {
let mut tokens = vec![];
while self.remaining().len() != 0 {
self.strip();
let (kind, consumed) = match self.step() {
Ok(k) => k,
Err(e) => return Err(
Syntax::error(&e, Span::point(&self.source, self.offset))
),
};
tokens.push(Spanned::new(
kind,
Span::new(&self.source, self.offset, consumed),
));
self.offset += consumed;
}
tokens.push(Spanned::new(Token::End, Span::empty()));
return Ok(tokens);
}
pub fn step(&self) -> Result<Bite, String> {
let source = self.remaining();
let rules: Vec<Box<dyn Fn(&str) -> Result<Bite, String>>> = vec![
Box::new(|s| Lexer::unit(s) ),
Box::new(|s| Lexer::open_bracket(s) ),
Box::new(|s| Lexer::close_bracket(s)),
Box::new(|s| Lexer::open_paren(s) ),
Box::new(|s| Lexer::close_paren(s) ),
Box::new(|s| Lexer::assign(s) ),
Box::new(|s| Lexer::lambda(s) ),
Box::new(|s| Lexer::print(s) ),
Box::new(|s| Lexer::sep(s) ),
Box::new(|s| Lexer::boolean(s)),
Box::new(|s| Lexer::real(s) ),
Box::new(|s| Lexer::string(s)),
Box::new(|s| Lexer::symbol(s) ),
];
let mut best = Err("Unexpected token".to_string());
for rule in &rules {
if let Ok((k, c)) = rule(source) {
match best {
Err(_) => best = Ok((k, c)),
Ok((_, o)) if c > o => best = Ok((k, c)),
Ok(_) => (),
}
}
}
return best;
}
pub fn remaining(&self) -> &str {
return &self.source.contents[self.offset..]
}
pub fn strip(&mut self) {
let mut len = 0;
for char in self.remaining().chars() {
if !char.is_whitespace() || char == '\n' {
break;
}
len += 1;
}
self.offset += len;
}
pub fn expect(source: &str, literal: &str) -> Result<usize, String> {
if literal.len() > source.len() {
return Err("Unexpected EOF while lexing".to_string());
}
match &source.as_bytes()[..literal.len()] {
s if s == literal.as_bytes() => Ok(literal.len()),
_ => Err(format!("Expected '{}'", source)),
}
}
pub fn eat_digits(source: &str) -> Result<usize, String> {
let mut len = 0;
for char in source.chars() {
match char {
n if n.is_digit(10) => len += 1,
_ => break,
}
}
return if len == 0 { Err("Expected digits".to_string()) } else { Ok(len) };
}
pub fn literal(source: &str, literal: &str, kind: Token) -> Result<Bite, String> {
Ok((kind, Lexer::expect(source, literal)?))
}
pub fn open_bracket(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "{", Token::OpenBracket)
}
pub fn close_bracket(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "}", Token::CloseBracket)
}
pub fn unit(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "()", Token::Unit)
}
pub fn open_paren(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "(", Token::OpenParen)
}
pub fn close_paren(source: &str) -> Result<Bite, String> {
Lexer::literal(source, ")", Token::CloseParen)
}
pub fn assign(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "=", Token::Assign)
}
pub fn lambda(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "->", Token::Lambda)
}
pub fn print(source: &str) -> Result<Bite, String> {
Lexer::literal(source, "print", Token::Print)
}
pub fn symbol(source: &str) -> Result<Bite, String> {
let mut len = 0;
for char in source.chars() {
if !char.is_ascii_alphanumeric() {
break;
}
len += 1;
}
return match len {
0 => Err("Expected a symbol".to_string()),
l => Ok((Token::Symbol, l)),
};
}
pub fn real(source: &str) -> Result<Bite, String> {
let mut len = 0;
len += Lexer::eat_digits(source)?;
len += Lexer::expect(&source[len..], ".")?;
len += Lexer::eat_digits(&source[len..])?;
let number = match f64::from_str(&source[..len]) {
Ok(n) => n,
Err(_) => panic!("Could not convert source to supposed real")
};
return Ok((Token::Number(Data::Real(number)), len));
}
pub fn string(source: &str) -> Result<Bite, String> {
let mut len = 0;
let mut escape = false;
let mut string = "".to_string();
len += Lexer::expect(source, "\"")?;
for c in source[len..].chars() {
len += 1;
if escape {
escape = false;
string.push(match c {
'"' => '"',
'\\' => '\\',
'n' => '\n',
't' => '\t',
o => return Err(format!("Unknown escape code '\\{}'", o)),
})
} else {
match c {
'\\' => escape = true,
'\"' => return Ok((Token::String(Data::String(string)), len)),
c => string.push(c),
}
}
}
return Err("Unexpected EOF while parsing string literal".to_string());
}
pub fn boolean(source: &str) -> Result<Bite, String> {
for (lit, val) in [
("true", true),
("false", false),
].iter() {
if let x @ Ok(_) = Lexer::literal(
source, lit, Token::Boolean(Data::Boolean(*val))
) { return x; }
}
return Err("Expected a boolean".to_string());
}
pub fn sep(source: &str) -> Result<Bite, String> {
let mut chars = source.chars();
let c = chars.next()
.ok_or("Unexpected EOF while parsing")?;
if c != '\n' && c != ';' {
return Err("Expected a separator such as a newline or semicolon".to_string())
}
let mut len = 1;
for c in chars {
if c != ';' && !c.is_whitespace() {
break;
}
len += 1;
}
return Ok((Token::Sep, len));
}
}
#[cfg(test)]
mod test {
use super::*;
use crate::common::data::Data;
#[test]
fn empty() {
let result = lex(Source::source(""));
let target: Result<Vec<Spanned<Token>>, Syntax> =
Ok(vec![Spanned::new(Token::End, Span::empty())]);
assert_eq!(result, target);
}
#[test]
fn assignment() {
let source = Source::source("heck = true");
let result = vec![
Spanned::new(Token::Symbol, Span::new(&source, 0, 4)),
Spanned::new(Token::Assign, Span::new(&source, 5, 1)),
Spanned::new(Token::Boolean(Data::Boolean(true)), Span::new(&source, 7, 4)),
Spanned::new(Token::End, Span::empty()),
];
assert_eq!(lex(source), Ok(result));
}
#[test]
fn whitespace() {
let source = Source::source(" true ; ");
let result = vec![
Spanned::new(Token::Boolean(Data::Boolean(true)), Span::new(&source, 2, 4)),
Spanned::new(Token::Sep, Span::new(&source, 8, 3)),
Spanned::new(Token::End, Span::empty()),
];
assert_eq!(lex(source), Ok(result));
}
#[test]
fn block() {
let source = Source::source("{\n\thello = true\n\thello\n}");
let result = vec![
Spanned::new(Token::OpenBracket, Span::new(&source, 0, 1)),
Spanned::new(Token::Sep, Span::new(&source, 1, 2)),
Spanned::new(Token::Symbol, Span::new(&source, 3, 5)),
Spanned::new(Token::Assign, Span::new(&source, 9, 1)),
Spanned::new(Token::Boolean(Data::Boolean(true)), Span::new(&source, 11, 4)),
Spanned::new(Token::Sep, Span::new(&source, 15, 2)),
Spanned::new(Token::Symbol, Span::new(&source, 17, 5)),
Spanned::new(Token::Sep, Span::new(&source, 22, 1)),
Spanned::new(Token::CloseBracket, Span::new(&source, 23, 1)),
Spanned::new(Token::End, Span::empty()),
];
assert_eq!(lex(source), Ok(result));
}
#[test]
fn function() {
let source = Source::source("identity = x -> x\nidentity (identity \"heck\")");
let result = vec![
Spanned::new(Token::Symbol, Span::new(&source, 0, 8)),
Spanned::new(Token::Assign, Span::new(&source, 9, 1)),
Spanned::new(Token::Symbol, Span::new(&source, 11, 1)),
Spanned::new(Token::Lambda, Span::new(&source, 13, 2)),
Spanned::new(Token::Symbol, Span::new(&source, 16, 1)),
Spanned::new(Token::Sep, Span::new(&source, 17, 1)),
Spanned::new(Token::Symbol, Span::new(&source, 18, 8)),
Spanned::new(Token::OpenParen, Span::new(&source, 27, 1)),
Spanned::new(Token::Symbol, Span::new(&source, 28, 8)),
Spanned::new(Token::String(Data::String("heck".to_string())), Span::new(&source, 37, 6)),
Spanned::new(Token::CloseParen, Span::new(&source, 43, 1)),
Spanned::new(Token::End, Span::empty()),
];
assert_eq!(lex(source), Ok(result));
}
fn test_literal(literal: &str, token: Token, length: usize) -> bool {
let result = Lexer::new(&Source::source(literal)).step();
match result {
Ok(v) => v == (token, length),
Err(_) => false
}
}
#[test]
fn boolean() {
if !test_literal("true", Token::Boolean(Data::Boolean(true)), 4) { panic!() }
if !test_literal("false", Token::Boolean(Data::Boolean(false)), 5) { panic!() }
}
#[test]
fn assign() {
if !test_literal("=", Token::Assign, 1) { panic!() }
}
#[test]
fn symbol() {
if !test_literal("orchard", Token::Symbol, 7) { panic!() }
}
#[test]
fn sep() {
if !test_literal(
"\n heck",
Token::Sep,
3,
) { panic!() }
if !test_literal(
";\n ; heck",
Token::Sep,
5,
) { panic!() }
}
#[test]
fn real() {
if !test_literal(
"2.0",
Token::Number(Data::Real(2.0)),
3,
) { panic!() }
if !test_literal(
"210938.2221",
Token::Number(Data::Real(210938.2221)),
11,
) { panic!() }
}
#[test]
fn string() {
let source = "\"heck\"";
if !test_literal(
source,
Token::String(Data::String("heck".to_string())),
source.len(),
) { panic!() }
let escape = "\"I said, \\\"Hello, world!\\\" didn't I?\"";
if !test_literal(
escape,
Token::String(Data::String("I said, \"Hello, world!\" didn't I?".to_string())),
escape.len(),
) { panic!() }
let unicode = "\"Yo 👋! Ünícode µ works just fine 🚩! うん、気持ちいい!\"";
if !test_literal(
unicode,
Token::String(Data::String("Yo 👋! Ünícode µ works just fine 🚩! うん、気持ちいい!".to_string())),
unicode.chars().collect::<Vec<char>>().len(),
) { panic!() }
}
#[test]
fn comma() {
let source = Source::source("heck\\ man");
let tokens = lex(source.clone());
assert_eq!(tokens, Err(Syntax::error("Unexpected token", Span::new(&source, 4, 1))));
}
}