use base::ast::{is_operator_char, Comment, CommentType};
use base::pos::{self, BytePos, Column, Line, Location, Spanned};
use std::fmt;
use std::str::Chars;
use self::Error::*;
/// A single lexical token.
///
/// Variants that borrow (`&'input str`) hold slices of the original source
/// text, so a token cannot outlive the input it was produced from.
#[derive(Clone, PartialEq, Debug)]
pub enum Token<'input> {
/// A `#!` line at the very start of the input. Holds the text after `#!`
/// with trailing whitespace removed.
ShebangLine(&'input str),
/// An identifier that is not one of the keywords below.
Identifier(&'input str),
/// An operator lexeme that is not one of the built-in punctuation tokens.
Operator(&'input str),
/// A string literal with its escape sequences already resolved.
StringLiteral(String),
/// A character literal with its escape sequence (if any) already resolved.
CharLiteral(char),
/// An integer literal.
IntLiteral(i64),
/// A byte literal: decimal digits immediately followed by `b`.
ByteLiteral(u8),
/// A floating point literal.
FloatLiteral(f64),
/// A documentation comment (`///` line or `/** */` block).
DocComment(Comment),
/// The keyword `and`.
And,
/// The keyword `else`.
Else,
/// The keyword `if`.
If,
/// The keyword `in`.
In,
/// The keyword `let`.
Let,
/// The keyword `match`.
Match,
/// The keyword `then`.
Then,
/// The keyword `type`.
Type,
/// The keyword `with`.
With,
/// The operator `:`.
Colon,
/// The character `,`.
Comma,
/// The operator `.`.
Dot,
/// The operator `=`.
Equals,
/// The character `\`.
Lambda,
/// The operator `|`.
Pipe,
/// The operator `->`.
RArrow,
/// The character `{`.
LBrace,
/// The character `[`.
LBracket,
/// The character `(`.
LParen,
/// The character `}`.
RBrace,
/// The character `]`.
RBracket,
/// The character `)`.
RParen,
// NOTE(review): `OpenBlock`, `CloseBlock` and `Semi` are never produced by
// the tokenizer in this file; presumably they are inserted by a later
// layout pass -- confirm against the consumers of this enum.
OpenBlock,
CloseBlock,
Semi,
/// Marks the end of the input.
EOF, }
impl<'input> fmt::Display for Token<'input> {
    /// Formats the token as the name of its kind.
    ///
    /// Any payload (identifier text, literal value, ...) is ignored, so every
    /// identifier prints as `Identifier`, every integer as `IntLiteral`, etc.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::Token::*;

        let name = match *self {
            // Tokens that carry a payload.
            ShebangLine(_) => "ShebangLine",
            Identifier(_) => "Identifier",
            Operator(_) => "Operator",
            StringLiteral(_) => "StringLiteral",
            CharLiteral(_) => "CharLiteral",
            IntLiteral(_) => "IntLiteral",
            ByteLiteral(_) => "ByteLiteral",
            FloatLiteral(_) => "FloatLiteral",
            DocComment(_) => "DocComment",

            // Keywords.
            And => "And",
            Else => "Else",
            If => "If",
            In => "In",
            Let => "Let",
            Match => "Match",
            Then => "Then",
            Type => "Type",
            With => "With",

            // Punctuation.
            Colon => "Colon",
            Comma => "Comma",
            Dot => "Dot",
            // NOTE(review): the only arm whose text differs from the variant
            // name ("Equal" vs `Equals`). Kept as is because the string is
            // user-visible output; confirm whether it is intentional.
            Equals => "Equal",
            Lambda => "Lambda",
            Pipe => "Pipe",
            RArrow => "RArrow",

            // Delimiters.
            LBrace => "LBrace",
            LBracket => "LBracket",
            LParen => "LParen",
            RBrace => "RBrace",
            RBracket => "RBracket",
            RParen => "RParen",

            // Block structure and end of input.
            OpenBlock => "OpenBlock",
            CloseBlock => "CloseBlock",
            Semi => "Semi",
            EOF => "EOF",
        };

        // Delegate to `str`'s `Display` so width/padding flags keep working.
        fmt::Display::fmt(name, f)
    }
}
/// A token together with the source span (start and end `Location`) it covers.
pub type SpannedToken<'input> = Spanned<Token<'input>, Location>;
/// A tokenizer error together with the source span it refers to.
pub type SpError = Spanned<Error, Location>;
// Errors reported by the tokenizer.
//
// Only plain `//` comments are used inside the macro invocation so that the
// tokens handed to `quick_error!` stay exactly as they were.
quick_error! {
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Error {
// A character literal with nothing between the quotes (`''`).
EmptyCharLiteral {
description("empty char literal")
}
// A character that cannot start any token, or an identifier-start
// character directly following a numeric literal.
UnexpectedChar(ch: char) {
description("unexpected character")
}
// The input ended inside a block comment, an escape sequence or a
// character literal.
UnexpectedEof {
description("unexpected end of file")
}
// A backslash followed by a character that is not a known escape code.
UnexpectedEscapeCode(ch: char) {
description("unexpected escape code")
}
// The character after a character literal's content was not `'`.
UnterminatedCharLiteral {
description("unterminated character literal")
}
// The input ended before the closing `"` of a string literal.
UnterminatedStringLiteral {
description("unterminated string literal")
}
// The digits of an integer or byte literal could not be parsed into the
// target integer type.
NonParseableInt {
description("cannot parse integer, probable overflow")
}
}
}
/// Builds an `Err` holding `code` spanned at the single point `location`
/// (start and end of the span are both `location`).
fn error<T>(location: Location, code: Error) -> Result<T, SpError> {
    let zero_width = pos::spanned2(location, location, code);
    Err(zero_width)
}
/// Returns `true` if `ch` may begin an identifier: `_` or an ASCII letter.
fn is_ident_start(ch: char) -> bool {
    let is_lower = 'a' <= ch && ch <= 'z';
    let is_upper = 'A' <= ch && ch <= 'Z';
    ch == '_' || is_lower || is_upper
}
fn is_ident_continue(ch: char) -> bool {
match ch {
'0'...'9' | '\'' => true,
ch => is_ident_start(ch),
}
}
/// Returns `true` if `ch` is a decimal digit (`0` through `9`).
fn is_digit(ch: char) -> bool {
    ch.to_digit(10).is_some()
}
/// Iterator over the characters of the input, pairing each character with the
/// `Location` at which it starts.
struct CharLocations<'input> {
/// Location of the character that the next call to `next` will yield.
location: Location,
/// The characters that have not been yielded yet.
chars: Chars<'input>,
}
impl<'input> CharLocations<'input> {
    /// Creates an iterator positioned at the first character of `input`:
    /// line 0, column 1, byte offset 0.
    pub fn new(input: &'input str) -> CharLocations<'input> {
        let origin = Location {
            line: Line::from(0),
            column: Column::from(1),
            absolute: BytePos::from(0),
        };
        let chars = input.chars();

        CharLocations {
            location: origin,
            chars: chars,
        }
    }
}
impl<'input> Iterator for CharLocations<'input> {
    type Item = (Location, char);

    /// Yields the next character together with the location it starts at and
    /// advances the stored location past that character.
    fn next(&mut self) -> Option<(Location, char)> {
        let ch = match self.chars.next() {
            Some(ch) => ch,
            None => return None,
        };

        let current = self.location;
        let mut advanced = current.shift(ch);
        // Columns handed out by this iterator are 1-based: if `shift` left
        // the column at 0, move it to 1.
        if advanced.column == Column::from(0) {
            advanced.column = Column::from(1);
        }
        self.location = advanced;

        Some((current, ch))
    }
}
/// Splits source text into `SpannedToken`s.
///
/// The tokenizer is used through its `Iterator` implementation and keeps one
/// character of lookahead.
pub struct Tokenizer<'input> {
/// The complete source text; token slices borrow from it.
input: &'input str,
/// Source of the characters that follow `lookahead`.
chars: CharLocations<'input>,
/// Location just past the last character consumed by `bump`; used as the
/// position of the `EOF` token and of end-of-file errors.
eof_location: Location,
/// The next character that has not been consumed yet, if any.
lookahead: Option<(Location, char)>,
}
impl<'input> Tokenizer<'input> {
    /// Creates a tokenizer over `input` and loads the first character into
    /// the lookahead slot.
    pub fn new(input: &'input str) -> Tokenizer<'input> {
        let mut chars = CharLocations::new(input);
        let eof_location = chars.location;

        Tokenizer {
            input: input,
            eof_location: eof_location,
            // Evaluated before `chars` is moved into the struct below.
            lookahead: chars.next(),
            chars: chars,
        }
    }

    /// Consumes the lookahead character and returns it with its location,
    /// or returns `None` once the input is exhausted.
    fn bump(&mut self) -> Option<(Location, char)> {
        match self.lookahead {
            Some((location, ch)) => {
                self.eof_location = self.eof_location.shift(ch);
                self.lookahead = self.chars.next();
                Some((location, ch))
            }
            None => None,
        }
    }

    /// Consumes all remaining input.
    fn skip_to_end(&mut self) {
        while self.bump().is_some() {}
    }

    /// Discards the rest of the input and returns `code` as an error located
    /// at `location`. Because the input is drained, the next token produced
    /// after an error is `EOF`.
    fn error<T>(&mut self, location: Location, code: Error) -> Result<T, SpError> {
        self.skip_to_end();
        error(location, code)
    }

    /// Returns an `UnexpectedEof` error located just past the last character
    /// consumed so far.
    fn eof_error<T>(&mut self) -> Result<T, SpError> {
        let location = self.eof_location;
        self.error(location, UnexpectedEof)
    }

    /// Returns the source text between the byte offsets of `start`
    /// (inclusive) and `end` (exclusive).
    fn slice(&self, start: Location, end: Location) -> &'input str {
        let start = start.absolute;
        let end = end.absolute;

        &self.input[start.to_usize()..end.to_usize()]
    }

    /// Consumes characters for as long as `keep_going` accepts them.
    ///
    /// Returns the location of the first rejected character (or the end of
    /// the input) and the source text from `start` up to that location.
    fn take_while<F>(&mut self, start: Location, mut keep_going: F) -> (Location, &'input str)
    where
        F: FnMut(char) -> bool,
    {
        self.take_until(start, |c| !keep_going(c))
    }

    /// Consumes characters until `terminate` accepts the lookahead character,
    /// which is left unconsumed.
    ///
    /// Returns the location of that character (or the end of the input) and
    /// the source text from `start` up to that location. Note that the slice
    /// begins at `start`, not at the first character examined here.
    fn take_until<F>(&mut self, start: Location, mut terminate: F) -> (Location, &'input str)
    where
        F: FnMut(char) -> bool,
    {
        while let Some((end, ch)) = self.lookahead {
            if terminate(ch) {
                return (end, self.slice(start, end));
            } else {
                self.bump();
            }
        }

        (self.eof_location, self.slice(start, self.eof_location))
    }

    /// Applies `test` to the lookahead character; `false` at end of input.
    fn test_lookahead<F>(&self, mut test: F) -> bool
    where
        F: FnMut(char) -> bool,
    {
        self.lookahead.map_or(false, |(_, ch)| test(ch))
    }

    /// Consumes a `//` comment up to (not including) the end of the line.
    ///
    /// `start` is the location of the first `/`. Returns a `DocComment` token
    /// for `///` comments and `None` for ordinary comments. One space
    /// directly after `///` is not part of the comment's content.
    fn line_comment(&mut self, start: Location) -> Option<SpannedToken<'input>> {
        let (end, comment) = self.take_until(start, |ch| ch == '\n');

        if comment.starts_with("///") {
            let skip = if comment.starts_with("/// ") { 4 } else { 3 };
            let doc = Token::DocComment(Comment {
                typ: CommentType::Line,
                content: comment[skip..].to_string(),
            });
            Some(pos::spanned2(start, end, doc))
        } else {
            None
        }
    }

    /// Consumes a `/* ... */` comment.
    ///
    /// `start` is the location of the opening `/`; the `*` after it must be
    /// the lookahead character. Returns a `DocComment` token for non-empty
    /// `/** ... */` comments, `Ok(None)` for all other terminated comments
    /// and an `UnexpectedEof` error if the input ends before `*/`.
    fn block_comment(&mut self, start: Location) -> Result<Option<SpannedToken<'input>>, SpError> {
        // Consume the `*` of the opening `/*` so that it cannot double as the
        // `*` of a closing `*/` (`/*/` does not end the comment).
        self.bump();

        loop {
            // `comment` is everything from the opening `/` up to this `*`.
            let (_, comment) = self.take_until(start, |ch| ch == '*');
            // Skip the `*` itself (does nothing at end of input).
            self.bump();

            match self.lookahead {
                Some((end, '/')) => {
                    self.bump();
                    // `/***/` yields exactly "/**" here and is treated as an
                    // ordinary (empty) comment rather than a doc comment.
                    if comment.starts_with("/**") && comment != "/**" {
                        let doc = Token::DocComment(Comment {
                            typ: CommentType::Block,
                            content: comment[3..].trim().to_string(),
                        });
                        return Ok(Some(pos::spanned2(start, end.shift('/'), doc)));
                    } else {
                        return Ok(None);
                    }
                }
                // A `*` that is not followed by `/`: keep scanning.
                Some((_, _)) => continue,
                None => return self.eof_error(),
            }
        }
    }

    /// Consumes an operator whose first character, located at `start`, has
    /// already been consumed.
    ///
    /// The lexemes `.`, `:`, `=`, `|` and `->` map to their dedicated tokens;
    /// everything else becomes `Token::Operator`. A lone `#` is extended with
    /// the identifier-start characters that follow it and then with any
    /// further operator characters, forming a single operator token.
    fn operator(&mut self, start: Location) -> SpannedToken<'input> {
        let (mut end, op) = self.take_while(start, is_operator_char);

        let token = match op {
            "." => Token::Dot,
            ":" => Token::Colon,
            "=" => Token::Equals,
            "|" => Token::Pipe,
            "->" => Token::RArrow,
            "#" => {
                self.take_while(start, is_ident_start);
                let (extended_end, extended_op) = self.take_while(start, is_operator_char);
                // The span must cover the whole extended lexeme, not only the
                // leading `#` measured by the first scan.
                end = extended_end;
                Token::Operator(extended_op)
            }
            op => Token::Operator(op),
        };

        pos::spanned2(start, end, token)
    }

    /// Consumes the character after a backslash and returns the character it
    /// denotes.
    ///
    /// Recognised codes are `\'`, `\"`, `\\`, `\/`, `\n`, `\r` and `\t`. Any
    /// other character is an `UnexpectedEscapeCode` error located at that
    /// character; end of input is an `UnexpectedEof` error.
    fn escape_code(&mut self) -> Result<char, SpError> {
        match self.bump() {
            Some((_, '\'')) => Ok('\''),
            Some((_, '"')) => Ok('"'),
            Some((_, '\\')) => Ok('\\'),
            Some((_, '/')) => Ok('/'),
            Some((_, 'n')) => Ok('\n'),
            Some((_, 'r')) => Ok('\r'),
            Some((_, 't')) => Ok('\t'),
            Some((start, ch)) => self.error(start, UnexpectedEscapeCode(ch)),
            None => self.eof_error(),
        }
    }

    /// Consumes a string literal whose opening `"`, located at `start`, has
    /// already been consumed.
    ///
    /// Escape sequences are resolved into the resulting string. If the input
    /// ends before the closing `"`, an `UnterminatedStringLiteral` error
    /// located at `start` is returned.
    fn string_literal(&mut self, start: Location) -> Result<SpannedToken<'input>, SpError> {
        let mut string = String::new();

        while let Some((next, ch)) = self.bump() {
            match ch {
                '\\' => string.push(self.escape_code()?),
                '"' => {
                    let end = next.shift(ch);
                    let token = Token::StringLiteral(string);
                    return Ok(pos::spanned2(start, end, token));
                }
                ch => string.push(ch),
            }
        }

        self.error(start, UnterminatedStringLiteral)
    }

    /// Consumes the rest of the current line and, if the line begins with
    /// `#!`, returns a `ShebangLine` token holding the text after `#!` with
    /// trailing whitespace removed.
    ///
    /// `start` is the location of the `#`, which has already been consumed.
    fn shebang_line(&mut self, start: Location) -> Option<SpannedToken<'input>> {
        let (end, line) = self.take_until(start, |ch| ch == '\n');

        if line.starts_with("#!") {
            let skip = 2;
            let result = line[skip..].trim_right();
            let tok = Token::ShebangLine(result);
            Some(pos::spanned2(start, end, tok))
        } else {
            None
        }
    }

    /// Consumes a character literal whose opening `'`, located at `start`,
    /// has already been consumed.
    ///
    /// # Errors
    ///
    /// * `EmptyCharLiteral` (at `start`) for `''`.
    /// * `UnterminatedCharLiteral` (at `start`) if the character after the
    ///   literal's content is not `'`.
    /// * `UnexpectedEof` if the input ends inside the literal.
    /// * Any error produced by `escape_code`.
    fn char_literal(&mut self, start: Location) -> Result<SpannedToken<'input>, SpError> {
        let ch = match self.bump() {
            Some((_, '\\')) => self.escape_code()?,
            Some((_, '\'')) => return self.error(start, EmptyCharLiteral),
            Some((_, ch)) => ch,
            None => return self.eof_error(),
        };

        match self.bump() {
            Some((end, '\'')) => Ok(pos::spanned2(
                start,
                end.shift('\''),
                Token::CharLiteral(ch),
            )),
            Some((_, _)) => self.error(start, UnterminatedCharLiteral),
            None => self.eof_error(),
        }
    }

    /// Consumes a numeric literal whose first character (a digit or a leading
    /// `-`), located at `start`, has already been consumed.
    ///
    /// * digits followed by `.` and optional further digits: `FloatLiteral`
    /// * digits followed by `b`: `ByteLiteral`
    /// * digits only: `IntLiteral`
    ///
    /// # Errors
    ///
    /// * `UnexpectedChar` (at that character) if an identifier-start
    ///   character other than `b` directly follows the digits.
    /// * `NonParseableInt` (at `start`) if the digits do not fit the target
    ///   integer type.
    fn numeric_literal(&mut self, start: Location) -> Result<SpannedToken<'input>, SpError> {
        let (end, int) = self.take_while(start, is_digit);

        let (start, end, token) = match self.lookahead {
            Some((_, '.')) => {
                self.bump(); // Skip the `.`.
                let (end, float) = self.take_while(start, is_digit);
                let value = float
                    .parse()
                    .expect("optional `-`, ASCII digits, `.`, ASCII digits always parses as f64");
                (start, end, Token::FloatLiteral(value))
            }
            Some((end, 'b')) => {
                self.bump(); // Skip the `b`.
                if let Ok(val) = int.parse() {
                    (start, end.shift('b'), Token::ByteLiteral(val))
                } else {
                    return self.error(start, NonParseableInt);
                }
            }
            Some((start, ch)) if is_ident_start(ch) => {
                return self.error(start, UnexpectedChar(ch))
            }
            None | Some(_) => if let Ok(val) = int.parse() {
                (start, end, Token::IntLiteral(val))
            } else {
                return self.error(start, NonParseableInt);
            },
        };

        Ok(pos::spanned2(start, end, token))
    }

    /// Consumes an identifier or keyword whose first character, located at
    /// `start`, has already been consumed.
    ///
    /// A `!` directly after the identifier characters is made part of the
    /// lexeme. The resulting text is then mapped to a keyword token or to
    /// `Token::Identifier`.
    fn identifier(&mut self, start: Location) -> SpannedToken<'input> {
        let (mut end, mut ident) = self.take_while(start, is_ident_continue);

        match self.lookahead {
            Some((_, c)) if c == '!' => {
                self.bump();
                // `!` is one column and one byte wide.
                end.column += 1.into();
                end.absolute += 1.into();
                ident = self.slice(start, end);
            }
            _ => (),
        }

        let token = match ident {
            "and" => Token::And,
            "else" => Token::Else,
            "if" => Token::If,
            "in" => Token::In,
            "let" => Token::Let,
            "match" => Token::Match,
            "then" => Token::Then,
            "type" => Token::Type,
            "with" => Token::With,
            src => Token::Identifier(src),
        };

        pos::spanned2(start, end, token)
    }
}
impl<'input> Iterator for Tokenizer<'input> {
type Item = Result<SpannedToken<'input>, SpError>;
fn next(&mut self) -> Option<Result<SpannedToken<'input>, SpError>> {
while let Some((start, ch)) = self.bump() {
return match ch {
',' => Some(Ok(pos::spanned2(start, start.shift(ch), Token::Comma))),
'\\' => Some(Ok(pos::spanned2(start, start.shift(ch), Token::Lambda))),
'{' => Some(Ok(pos::spanned2(start, start.shift(ch), Token::LBrace))),
'[' => Some(Ok(pos::spanned2(start, start.shift(ch), Token::LBracket))),
'(' => Some(Ok(pos::spanned2(start, start.shift(ch), Token::LParen))),
'}' => Some(Ok(pos::spanned2(start, start.shift(ch), Token::RBrace))),
']' => Some(Ok(pos::spanned2(start, start.shift(ch), Token::RBracket))),
')' => Some(Ok(pos::spanned2(start, start.shift(ch), Token::RParen))),
'"' => Some(self.string_literal(start)),
'\'' => Some(self.char_literal(start)),
'/' if self.test_lookahead(|ch| ch == '/') => match self.line_comment(start) {
Some(token) => Some(Ok(token)),
None => continue,
},
'/' if self.test_lookahead(|ch| ch == '*') => match self.block_comment(start) {
Ok(Some(token)) => Some(Ok(token)),
Ok(None) => continue,
Err(err) => Some(Err(err)),
},
'#' if start.absolute == BytePos::from(0) &&
self.test_lookahead(|ch| ch == '!') =>
{
match self.shebang_line(start) {
Some(token) => Some(Ok(token)),
None => continue,
}
}
ch if is_ident_start(ch) => Some(Ok(self.identifier(start))),
ch if is_digit(ch) || (ch == '-' && self.test_lookahead(is_digit)) => {
Some(self.numeric_literal(start))
}
ch if is_operator_char(ch) => Some(Ok(self.operator(start))),
ch if ch.is_whitespace() => continue,
ch => Some(self.error(start, UnexpectedChar(ch))),
};
}
Some(Ok(pos::spanned2(
self.eof_location,
self.eof_location,
Token::EOF,
)))
}
}
#[cfg(test)]
mod test {
    use base::ast::Comment;
    use base::pos::{self, BytePos, Column, Line, Location, Spanned};

    use super::*;
    use super::{error, Tokenizer};
    use token::Token;
    use token::Token::*;

    /// Builds the location of byte offset `byte` under the assumption that
    /// the whole input is a single line of one-byte characters: line 0,
    /// 1-based column.
    fn loc(byte: usize) -> Location {
        Location {
            line: Line::from(0),
            column: Column::from(byte + 1),
            absolute: BytePos::from(byte),
        }
    }

    /// Tokenizes `input`, stopping before the first `EOF` token (the
    /// tokenizer itself would keep yielding `EOF` forever).
    fn tokenizer<'input>(
        input: &'input str,
    ) -> Box<Iterator<Item = Result<SpannedToken<'input>, SpError>> + 'input> {
        Box::new(Tokenizer::new(input).take_while(|token| match *token {
            Ok(Spanned {
                value: Token::EOF, ..
            }) => false,
            _ => true,
        }))
    }

    /// Checks that tokenizing `input` yields exactly the `expected` tokens.
    ///
    /// Each expectation pairs a token with a marker string: the run of `~`
    /// characters must sit at the same byte offsets as the token occupies in
    /// `input`. Only the first and the last `~` are inspected; the spaces
    /// after the last `~` are padding for readability.
    fn test(input: &str, expected: Vec<(&str, Token)>) {
        let mut tokenizer = tokenizer(input);
        let mut count = 0;
        let length = expected.len();

        for (token, (expected_span, expected_tok)) in tokenizer.by_ref().zip(expected.into_iter()) {
            count += 1;
            println!("{:?}", token);
            let start = loc(expected_span.find("~").unwrap());
            let end = loc(expected_span.rfind("~").unwrap() + 1);
            assert_eq!(Ok(pos::spanned2(start, end, expected_tok)), token);
        }

        // Every expectation was matched, there was at least one, and the
        // tokenizer produced nothing beyond them.
        assert_eq!(count, length);
        assert!(count > 0);
        assert_eq!(None, tokenizer.next());
    }

    #[test]
    fn sample_lambda_expr() {
        test(
            r#"(hi_, \a -> a ** a)"#,
            vec![
                (r#"~                  "#, LParen),
                (r#" ~~~               "#, Identifier("hi_")),
                (r#"    ~              "#, Comma),
                (r#"      ~            "#, Lambda),
                (r#"       ~           "#, Identifier("a")),
                (r#"         ~~        "#, RArrow),
                (r#"            ~      "#, Identifier("a")),
                (r#"              ~~   "#, Operator("**")),
                (r#"                 ~ "#, Identifier("a")),
                (r#"                  ~"#, RParen),
            ],
        );
    }

    #[test]
    fn sample_array() {
        test(
            r#"[1, a]"#,
            vec![
                (r#"~     "#, LBracket),
                (r#" ~    "#, IntLiteral(1)),
                (r#"  ~   "#, Comma),
                (r#"    ~ "#, Identifier("a")),
                (r#"     ~"#, RBracket),
            ],
        );
    }

    #[test]
    fn builtin_operators() {
        test(
            r#". : = | ->"#,
            vec![
                (r#"~         "#, Dot),
                (r#"  ~       "#, Colon),
                (r#"    ~     "#, Equals),
                (r#"      ~   "#, Pipe),
                (r#"        ~~"#, RArrow),
            ],
        );
    }

    #[test]
    fn user_defined_operators() {
        test(
            r#"+-* * /&|=<>: .. <->"#,
            vec![
                (r#"~~~                 "#, Operator("+-*")),
                (r#"    ~               "#, Operator("*")),
                (r#"      ~~~~~~~       "#, Operator("/&|=<>:")),
                (r#"              ~~    "#, Operator("..")),
                (r#"                 ~~~"#, Operator("<->")),
            ],
        );
    }

    #[test]
    fn delimters() {
        test(
            r#"{][ () }] "#,
            vec![
                (r#"~         "#, LBrace),
                (r#" ~        "#, RBracket),
                (r#"  ~       "#, LBracket),
                (r#"    ~     "#, LParen),
                (r#"     ~    "#, RParen),
                (r#"       ~  "#, RBrace),
                (r#"        ~ "#, RBracket),
            ],
        );
    }

    #[test]
    fn string_literals() {
        test(
            r#"foo "bar\"\n" baz "" "\t""#,
            vec![
                (r#"~~~                      "#, Identifier("foo")),
                (
                    r#"    ~~~~~~~~~            "#,
                    StringLiteral("bar\"\n".to_string()),
                ),
                (r#"              ~~~        "#, Identifier("baz")),
                (
                    r#"                  ~~     "#,
                    StringLiteral("".to_string()),
                ),
                (
                    r#"                     ~~~~"#,
                    StringLiteral("\t".to_string()),
                ),
            ],
        );
    }

    #[test]
    fn string_literal_unexpected_escape_code() {
        assert_eq!(
            tokenizer(r#""\X""#).last(),
            Some(error(loc(2), UnexpectedEscapeCode('X')))
        );
    }

    #[test]
    fn string_literal_unterminated() {
        assert_eq!(
            tokenizer(r#"foo "bar\"\n baz"#).last(),
            Some(error(loc(4), UnterminatedStringLiteral))
        );
    }

    #[test]
    fn char_literals() {
        test(
            r#"foo 'b' '\\' '\''"#,
            vec![
                (r#"~~~              "#, Identifier("foo")),
                (r#"    ~~~          "#, CharLiteral('b')),
                (r#"        ~~~~     "#, CharLiteral('\\')),
                (r#"             ~~~~"#, CharLiteral('\'')),
            ],
        );
    }

    #[test]
    fn char_literal_empty() {
        assert_eq!(
            tokenizer(r#"foo ''"#).last(),
            Some(error(loc(4), EmptyCharLiteral))
        );
    }

    #[test]
    fn char_literal_unexpected_escape_code() {
        assert_eq!(
            tokenizer(r#"'\X'"#).last(),
            Some(error(loc(2), UnexpectedEscapeCode('X')))
        );
    }

    #[test]
    fn char_literal_unexpected_eof() {
        assert_eq!(tokenizer(r#"'"#).last(), Some(error(loc(1), UnexpectedEof)));
        assert_eq!(
            tokenizer(r#"  '"#).last(),
            Some(error(loc(3), UnexpectedEof))
        );
        assert_eq!(
            tokenizer(r#"'b"#).last(),
            Some(error(loc(2), UnexpectedEof))
        );
        assert_eq!(
            tokenizer(r#"'\\"#).last(),
            Some(error(loc(3), UnexpectedEof))
        );
        assert_eq!(
            tokenizer(r#"'\'"#).last(),
            Some(error(loc(3), UnexpectedEof))
        );
    }

    #[test]
    fn char_literal_unterminated() {
        assert_eq!(
            tokenizer(r#"'frooble'"#).last(),
            Some(error(loc(0), UnterminatedCharLiteral))
        );
    }

    #[test]
    fn int_literals() {
        test(
            r#"3 1036 45 -123"#,
            vec![
                (r#"~             "#, IntLiteral(3)),
                (r#"  ~~~~        "#, IntLiteral(1036)),
                (r#"       ~~     "#, IntLiteral(45)),
                (r#"          ~~~~"#, IntLiteral(-123)),
            ],
        );
    }

    #[test]
    fn int_literal_overflow() {
        assert_eq!(
            tokenizer(r#"12345678901234567890"#).last(),
            Some(error(loc(0), NonParseableInt))
        );
    }

    #[test]
    fn byte_literals() {
        test(
            r#"3b 255b 45b"#,
            vec![
                (r#"~~         "#, ByteLiteral(3)),
                (r#"   ~~~~    "#, ByteLiteral(255)),
                (r#"        ~~~"#, ByteLiteral(45)),
            ],
        );
    }

    #[test]
    fn float_literals() {
        test(
            r#"03.1415 1036.2 -0.0"#,
            vec![
                (r#"~~~~~~~            "#, FloatLiteral(3.1415)),
                (r#"        ~~~~~~     "#, FloatLiteral(1036.2)),
                (r#"               ~~~~"#, FloatLiteral(-0.0)),
            ],
        );
    }

    #[test]
    fn line_comments() {
        test(
            r#"hi // hellooo"#,
            vec![(r#"~~           "#, Identifier("hi"))],
        );
    }

    #[test]
    fn line_doc_comments() {
        test(
            r#"hi ///hellooo/// hi"#,
            vec![
                (r#"~~                 "#, Identifier("hi")),
                (
                    r#"   ~~~~~~~~~~~~~~~~"#,
                    DocComment(Comment {
                        typ: CommentType::Line,
                        content: "hellooo/// hi".to_string(),
                    }),
                ),
            ],
        );
    }

    #[test]
    fn line_doc_comments_with_space() {
        test(
            r#"hi /// hellooo/// hi"#,
            vec![
                (r#"~~                  "#, Identifier("hi")),
                (
                    r#"   ~~~~~~~~~~~~~~~~~"#,
                    DocComment(Comment {
                        typ: CommentType::Line,
                        content: "hellooo/// hi".to_string(),
                    }),
                ),
            ],
        );
    }

    #[test]
    fn shebang_line_token_test() {
        // The markers below are laid out by byte offset, with the `\n` of the
        // input counted as one ordinary character.
        // NOTE(review): `loc` always reports line 0, so the expectations for
        // the tokens after the newline only hold if `Location::shift` does
        // not change line/column at `\n` -- confirm against `base::pos`.
        test(
            "#!/bin/gluon\nhi /// hellooo/// hi",
            vec![
                (
                    r#"~~~~~~~~~~~~                     "#,
                    ShebangLine("/bin/gluon"),
                ),
                (r#"             ~~                  "#, Identifier("hi")),
                (
                    r#"                ~~~~~~~~~~~~~~~~~"#,
                    DocComment(Comment {
                        typ: CommentType::Line,
                        content: "hellooo/// hi".to_string(),
                    }),
                ),
            ],
        );
    }
}