use core::RpNumber;
use errors::{Error, Result};
use num_bigint::BigInt;
use num_traits::Zero;
use std::borrow::Cow;
use std::result;
use std::str::CharIndices;
use token::Token;
/// A lexer over reproto source text, with a three-character lookahead window.
pub struct Lexer<'input> {
    // Remaining character stream beyond the lookahead window.
    source: CharIndices<'input>,
    // Total byte length of the input; reported as the position at end-of-input.
    source_len: usize,
    // The full input, used to slice raw code-block content out of.
    source_str: &'input str,
    // Lookahead window: current character and the two following it,
    // each as (byte offset, char); `None` once the stream is exhausted.
    n0: Option<(usize, char)>,
    n1: Option<(usize, char)>,
    n2: Option<(usize, char)>,
    // Scratch buffer for building string literals with escapes resolved.
    buffer: String,
    // Set while inside a `{{ ... }}` code block: (start of `{{`, end of `{{`).
    code_block: Option<(usize, usize)>,
    // Pending `}}` close token to emit on the next iteration: (start, end).
    code_close: Option<(usize, usize)>,
}
/// Map a raw identifier to its reserved-word token, if it is one.
///
/// Returns `None` when `content` is not a keyword, in which case the caller
/// treats it as a plain identifier.
pub fn match_keyword(content: &str) -> Option<Token> {
    match content {
        "any" => Some(Token::AnyKeyword),
        "interface" => Some(Token::InterfaceKeyword),
        "type" => Some(Token::TypeKeyword),
        "enum" => Some(Token::EnumKeyword),
        "tuple" => Some(Token::TupleKeyword),
        "service" => Some(Token::ServiceKeyword),
        "use" => Some(Token::UseKeyword),
        "as" => Some(Token::AsKeyword),
        "float" => Some(Token::FloatKeyword),
        "double" => Some(Token::DoubleKeyword),
        "i32" => Some(Token::Signed32),
        "i64" => Some(Token::Signed64),
        "u32" => Some(Token::Unsigned32),
        "u64" => Some(Token::Unsigned64),
        "boolean" => Some(Token::BooleanKeyword),
        "string" => Some(Token::StringKeyword),
        "datetime" => Some(Token::DateTimeKeyword),
        "bytes" => Some(Token::BytesKeyword),
        "stream" => Some(Token::StreamKeyword),
        _ => None,
    }
}
impl<'input> Lexer<'input> {
    /// Slide the three-character lookahead window forward by one character.
    #[inline]
    fn step(&mut self) {
        self.n0 = self.n1;
        self.n1 = self.n2;
        self.n2 = self.source.next();
    }

    /// Step `n` characters forward and return the byte offset of the new
    /// current character, or the end of input if the stream is exhausted.
    #[inline]
    fn step_n(&mut self, n: usize) -> usize {
        for _ in 0..n {
            self.step();
        }

        // NOTE(review): uses `source_str.len()` here while `pos()` uses
        // `source_len` — the two are the same value by construction in `lex`.
        self.n0
            .map(|n| n.0)
            .unwrap_or_else(|| self.source_str.len())
    }

    /// Peek the current character without consuming it.
    #[inline]
    fn one(&mut self) -> Option<(usize, char)> {
        self.n0
    }

    /// Peek the current and next character; the offset is of the first.
    #[inline]
    fn two(&mut self) -> Option<(usize, char, char)> {
        if let (Some((pos, a)), Some((_, b))) = (self.n0, self.n1) {
            Some((pos, a, b))
        } else {
            None
        }
    }

    /// Peek the next three characters; the offset is of the first.
    #[inline]
    fn three(&mut self) -> Option<(usize, char, char, char)> {
        if let (Some((pos, a)), Some((_, b)), Some((_, c))) = (self.n0, self.n1, self.n2) {
            Some((pos, a, b, c))
        } else {
            None
        }
    }

    /// Byte offset of the current character, or end-of-input when exhausted.
    #[inline]
    fn pos(&self) -> usize {
        self.n0.map(|n| n.0).unwrap_or(self.source_len)
    }

    /// Lex an identifier or keyword beginning at `start`.
    ///
    /// Leading underscores are stripped from the token content (so `_foo`
    /// yields identifier `foo` — see `test_identifier_stripping`), and an
    /// underscore-prefixed word is never treated as a keyword.
    fn identifier(&mut self, start: usize) -> Result<(usize, Token<'input>, usize)> {
        // Skip over any leading underscores.
        let (stripped, _) = take!(self, start, '_');
        let (end, content) = take!(self, stripped, 'a'...'z' | '_' | '0'...'9');

        // Had at least one leading underscore: always a plain identifier.
        if stripped != start {
            return Ok((start, Token::Identifier(content.into()), end));
        }

        let token = match match_keyword(content) {
            Some(token) => token,
            None => {
                return Ok((start, Token::Identifier(content.into()), end));
            }
        };

        return Ok((start, token, end));
    }

    /// Lex a type identifier (begins with an uppercase letter).
    fn type_identifier(&mut self, start: usize) -> Result<(usize, Token<'input>, usize)> {
        let (end, content) = take!(self, start, 'A'...'Z' | 'a'...'z' | '0'...'9');
        Ok((start, Token::TypeIdentifier(content.into()), end))
    }

    /// Parse the fractional digits of a number.
    ///
    /// Returns the count of leading zeros (they contribute to the decimal
    /// shift but vanish when parsed as an integer) together with the parsed
    /// digits.
    fn parse_fraction(input: &str) -> result::Result<(usize, BigInt), &'static str> {
        let dec = input
            .chars()
            .enumerate()
            .find(|&(_, ref c)| *c != '0')
            .map(|(i, _)| i)
            .unwrap_or(0usize);

        let fraction: BigInt = input.parse().map_err(|_| "illegal fraction")?;

        Ok((dec, fraction))
    }

    /// Fold a parsed fraction into `digits`/`decimal`.
    ///
    /// Shifts `digits` left by one decimal place per significant fraction
    /// digit (plus `dec` leading zeros), then adds the fraction digits.
    fn apply_fraction(digits: &mut BigInt, decimal: &mut usize, dec: usize, fraction: BigInt) {
        *decimal += dec;

        let mut f = fraction.clone();
        let ten: BigInt = 10.into();

        // One shift per decimal digit in the fraction.
        while !f.is_zero() {
            *digits = digits.clone() * ten.clone();
            *decimal += 1;
            f = f / ten.clone();
        }

        *digits = digits.clone() + fraction;
    }

    /// Apply a base-10 exponent to `digits`/`decimal`.
    ///
    /// A negative exponent just widens the decimal shift; a positive one
    /// first consumes the decimal shift, then scales the digits by ten.
    fn apply_exponent(digits: &mut BigInt, decimal: &mut usize, exponent: i32) {
        if exponent < 0 {
            *decimal += exponent.abs() as usize;
            return;
        }

        let ten: BigInt = 10.into();

        for _ in 0..exponent {
            if *decimal > 0 {
                *decimal = *decimal - 1;
            } else {
                *digits = digits.clone() * ten.clone();
            }
        }
    }

    /// Lex a (possibly negative, possibly fractional) number token.
    fn number(&mut self, start: usize) -> Result<(usize, Token<'input>, usize)> {
        // NOTE(review): `parse_number` reports its error offsets as absolute
        // byte positions (the `end` from `take!`), so adding `start` here
        // looks like it double-counts — verify against error reporting.
        let (end, number) = self.parse_number(start).map_err(|(message, offset)| {
            Error::InvalidNumber {
                message: message,
                pos: start + offset,
            }
        })?;

        Ok((start, Token::Number(number), end))
    }

    /// Parse number syntax: `-?digits(.digits(e-?digits)?)?`.
    ///
    /// The result is `digits / 10^decimal`; errors carry a static message
    /// and the byte offset at which the problem was detected.
    fn parse_number(
        &mut self,
        start: usize,
    ) -> result::Result<(usize, RpNumber), (&'static str, usize)> {
        // Optional leading minus sign.
        let (negative, offset) = if let Some((_, '-')) = self.one() {
            (true, self.step_n(1))
        } else {
            (false, start)
        };

        // Whole part.
        let (mut end, mut digits) = {
            let (end, whole) = take!(self, offset, '0'...'9');
            (
                end,
                whole
                    .parse::<BigInt>()
                    .map_err(|_| ("illegal number", end))?,
            )
        };

        // Number of decimal places accumulated so far.
        let mut decimal = 0usize;

        if let Some((_, '.')) = self.one() {
            let offset = self.step_n(1);

            {
                let (e, fraction) = take!(self, offset, '0'...'9');
                end = e;
                let (dec, fraction) = Self::parse_fraction(fraction).map_err(|e| (e, end))?;
                Self::apply_fraction(&mut digits, &mut decimal, dec, fraction);
            }

            // An exponent is only accepted after a fraction.
            if let Some((_, 'e')) = self.one() {
                let offset = self.step_n(1);
                let (e, content) = take!(self, offset, '-' | '0'...'9');
                end = e;
                let exponent: i32 = content.parse().map_err(|_| ("illegal exponent", end))?;
                Self::apply_exponent(&mut digits, &mut decimal, exponent);
            }
        }

        let digits = if negative { -digits } else { digits };

        let number = RpNumber {
            digits: digits,
            decimal: decimal,
        };

        Ok((end, number))
    }

    /// Decode the four hex digits of a `\uXXXX` escape into a character.
    ///
    /// Error offsets are relative to the first hex digit.
    fn decode_unicode4(&mut self) -> result::Result<char, (&'static str, usize)> {
        let mut res = 0u32;

        for x in 0..4u32 {
            let c = self.one()
                .ok_or_else(|| ("expected digit", x as usize))?
                .1
                .to_string();
            let c = u32::from_str_radix(&c, 16).map_err(|_| ("expected hex digit", x as usize))?;
            // Most significant nibble comes first.
            res += c << (4 * (3 - x));
            self.step();
        }

        Ok(::std::char::from_u32(res).ok_or_else(|| ("invalid character", 0usize))?)
    }

    /// Lex one escape sequence inside a string literal.
    ///
    /// Expects the current character to be the backslash; consumes the
    /// whole escape and returns the character it denotes.
    fn escape(&mut self, pos: usize) -> Result<char> {
        // Move past the backslash.
        self.step();

        let (_, escape) = self.one()
            .ok_or_else(|| Error::UnterminatedEscape { start: self.pos() })?;

        let escaped = match escape {
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            'u' => {
                let seq_start = self.step_n(1);

                let c = self.decode_unicode4()
                    .map_err(|(message, offset)| Error::InvalidEscape {
                        message: message,
                        pos: seq_start + offset,
                    })?;

                // `decode_unicode4` already stepped past the digits.
                return Ok(c);
            }
            _ => {
                return Err(Error::InvalidEscape {
                    message: "unrecognized escape, should be one of: \\n, \\r, \\t, or \\uXXXX",
                    pos: pos,
                }.into());
            }
        };

        self.step();
        return Ok(escaped);
    }

    /// Lex a double-quoted string literal, resolving escapes.
    ///
    /// The decoded contents are accumulated in `self.buffer` and cloned
    /// into the token.
    fn string(&mut self, start: usize) -> Result<(usize, Token<'input>, usize)> {
        self.buffer.clear();

        // Skip the opening quote.
        self.step();

        while let Some((pos, c)) = self.one() {
            if c == '\\' {
                let c = self.escape(pos)?;
                self.buffer.push(c);
                continue;
            }

            if c == '"' {
                let end = self.step_n(1);
                return Ok((start, Token::String(self.buffer.clone()), end));
            }

            self.buffer.push(c);
            self.step();
        }

        Err(Error::UnterminatedString { start: start }.into())
    }

    /// Lex the raw contents of a `{{ ... }}` code block up to the closing
    /// `}}`, queuing the close token to be emitted on the next iteration.
    fn code_block(
        &mut self,
        code_start: usize,
        start: usize,
    ) -> Result<(usize, Token<'input>, usize)> {
        while let Some((end, a, b)) = self.two() {
            if ('}', '}') == (a, b) {
                let code_end = self.step_n(2);
                let out = &self.source_str[start..end];

                // Leave code-block mode and queue the `}}` token.
                self.code_block = None;
                self.code_close = Some((end, code_end));

                return Ok((code_start, Token::CodeContent(out.into()), code_end));
            }

            self.step();
        }

        Err(Error::UnterminatedCodeBlock { start: start }.into())
    }

    /// Collect consecutive `//!` package-level doc comment lines into one
    /// token, skipping the whitespace between them.
    fn package_doc_comments(&mut self, start: usize) -> Result<(usize, Token<'input>, usize)> {
        let mut comment: Vec<Cow<'input, str>> = Vec::new();

        loop {
            // Skip whitespace separating comment lines.
            let (end, _) = take!(self, start, ' ' | '\n' | '\r' | '\t');

            if let Some((_, '/', '/', '!')) = self.three() {
                let start = self.step_n(3);
                let (_, content) = take_until!(self, start, '\n' | '\r');
                comment.push(content.into());
            } else {
                return Ok((start, Token::PackageDocComment(comment), end));
            }
        }
    }

    /// Collect consecutive `///` doc comment lines into one token,
    /// skipping the whitespace between them.
    fn doc_comments(&mut self, start: usize) -> Result<(usize, Token<'input>, usize)> {
        let mut comment: Vec<Cow<'input, str>> = Vec::new();

        loop {
            // Skip whitespace separating comment lines.
            let (end, _) = take!(self, start, ' ' | '\n' | '\r' | '\t');

            if let Some((_, '/', '/', '/')) = self.three() {
                let start = self.step_n(3);
                let (_, content) = take_until!(self, start, '\n' | '\r');
                comment.push(content.into());
            } else {
                return Ok((start, Token::DocComment(comment), end));
            }
        }
    }

    /// Skip a `//` line comment through the end of the line.
    fn line_comment(&mut self) {
        let start = self.step_n(2);
        let _ = take_until!(self, start, '\n' | '\r');
    }

    /// Skip a `/* ... */` block comment (non-nesting).
    fn block_comment(&mut self) {
        self.step_n(2);

        while let Some((_, a, b)) = self.two() {
            if ('*', '/') == (a, b) {
                self.step();
                self.step();
                break;
            }

            self.step();
        }
    }

    /// Produce the next token triple `(start, token, end)`, or `None` at
    /// end of input.
    ///
    /// Pending code-block state is handled first, then doc comments,
    /// ordinary comments, two-character operators, and finally
    /// single-character tokens and literals.
    fn normal_mode_next(&mut self) -> Option<Result<(usize, Token<'input>, usize)>> {
        // A `}}` close token queued by `code_block`.
        if let Some((start, end)) = self.code_close {
            self.code_close = None;
            return Some(Ok((start, Token::CodeClose, end)));
        }

        // Inside `{{ ... }}`: emit the raw content next.
        if let Some((code_start, start)) = self.code_block {
            return Some(self.code_block(code_start, start));
        }

        loop {
            // Doc comments are significant tokens; check before plain `//`.
            if let Some((start, '/', '/', '!')) = self.three() {
                return Some(self.package_doc_comments(start));
            }

            if let Some((start, '/', '/', '/')) = self.three() {
                return Some(self.doc_comments(start));
            }

            // Two-character operators and comment/code-block openers.
            if let Some((start, a, b)) = self.two() {
                let token = match (a, b) {
                    ('/', '/') => {
                        self.line_comment();
                        continue;
                    }
                    ('/', '*') => {
                        self.block_comment();
                        continue;
                    }
                    ('{', '{') => {
                        let end = self.step_n(2);
                        self.code_block = Some((start, end));
                        return Some(Ok((start, Token::CodeOpen, end)));
                    }
                    (':', ':') => Some(Token::Scope),
                    ('-', '>') => Some(Token::RightArrow),
                    _ => None,
                };

                if let Some(token) = token {
                    let end = self.step_n(2);
                    return Some(Ok((start, token, end)));
                }
            }

            // Single-character tokens, literals, and whitespace.
            if let Some((start, c)) = self.one() {
                let token = match c {
                    '{' => Token::LeftCurly,
                    '}' => Token::RightCurly,
                    '[' => Token::LeftBracket,
                    ']' => Token::RightBracket,
                    '(' => Token::LeftParen,
                    ')' => Token::RightParen,
                    ';' => Token::SemiColon,
                    ':' => Token::Colon,
                    ',' => Token::Comma,
                    '.' => Token::Dot,
                    '?' => Token::QuestionMark,
                    '#' => Token::Hash,
                    '!' => Token::Bang,
                    '=' => Token::Equal,
                    '_' | 'a'...'z' => return Some(self.identifier(start)),
                    'A'...'Z' => return Some(self.type_identifier(start)),
                    '"' => return Some(self.string(start)),
                    '-' | '0'...'9' => return Some(self.number(start)),
                    ' ' | '\n' | '\r' | '\t' => {
                        self.step();
                        continue;
                    }
                    // Unrecognized character: fall through to the error.
                    _ => break,
                };

                let end = self.step_n(1);
                return Some(Ok((start, token, end)));
            } else {
                return None;
            }
        }

        Some(Err(Error::Unexpected { pos: self.pos() }))
    }
}
impl<'input> Iterator for Lexer<'input> {
type Item = Result<(usize, Token<'input>, usize)>;
fn next(&mut self) -> Option<Self::Item> {
self.normal_mode_next()
}
}
/// Construct a lexer over `input`, priming the three-character lookahead
/// window with the first three characters of the stream.
pub fn lex(input: &str) -> Lexer {
    let mut chars = input.char_indices();
    let (n0, n1, n2) = (chars.next(), chars.next(), chars.next());

    Lexer {
        source: chars,
        source_len: input.len(),
        source_str: input,
        n0: n0,
        n1: n1,
        n2: n2,
        buffer: String::new(),
        code_block: None,
        code_close: None,
    }
}
#[cfg(test)]
pub mod tests {
    use super::Token::*;
    use super::*;

    // Collect all tokens from `input`, failing on the first lexer error.
    fn tokenize(input: &str) -> Result<Vec<(usize, Token, usize)>> {
        lex(input).collect()
    }

    // Basic smoke test: identifiers, type identifiers, keywords, curlies,
    // and a string literal, with exact byte spans.
    #[test]
    pub fn test_lexer() {
        let expected = vec![
            (0, Identifier("hello".into()), 5),
            (6, TypeIdentifier("World".into()), 11),
            (12, LeftCurly, 13),
            (14, UseKeyword, 17),
            (18, AsKeyword, 20),
            (21, RightCurly, 22),
            (23, String("hello world".into()), 36),
        ];

        assert_eq!(
            expected,
            tokenize("hello World { use as } \"hello world\"").unwrap()
        );
    }

    // `{{ ... }}` emits open, raw content, and close tokens.
    #[test]
    pub fn test_code_block() {
        let expected = vec![
            (0, CodeOpen, 2),
            (0, CodeContent(" foo bar baz \n zing ".into()), 24),
            (22, CodeClose, 24),
        ];

        assert_eq!(expected, tokenize("{{ foo bar baz \n zing }}").unwrap());
    }

    // Negative number with fraction and negative exponent:
    // -12.42e-4 == -1242 / 10^6.
    #[test]
    pub fn test_complex_number() {
        let expected = vec![
            (
                0,
                Number(RpNumber {
                    digits: (-1242).into(),
                    decimal: 6,
                }),
                9,
            ),
        ];

        assert_eq!(expected, tokenize("-12.42e-4").unwrap());
    }

    // Plain integer.
    #[test]
    pub fn test_number_2() {
        assert_eq!(vec![(0, Number(12.into()), 2)], tokenize("12").unwrap());
    }

    // Scoped name with `::` and `.` separators.
    #[test]
    pub fn test_name() {
        let expected = vec![
            (0, Identifier("foo".into()), 3),
            (3, Scope, 5),
            (5, TypeIdentifier("Bar".into()), 8),
            (8, Dot, 9),
            (9, TypeIdentifier("Baz".into()), 12),
        ];

        assert_eq!(expected, tokenize("foo::Bar.Baz").unwrap());
    }

    // Escape sequences are resolved inside string literals.
    #[test]
    pub fn test_strings() {
        let expected = vec![(0, String("foo\nbar".to_owned()), 10)];
        assert_eq!(expected, tokenize("\"foo\\nbar\"").unwrap());
    }

    // Instance-construction syntax: name, parens, field, number.
    #[test]
    pub fn test_instance() {
        let expected = vec![
            (0, Identifier("foo".into()), 3),
            (3, Scope, 5),
            (5, TypeIdentifier("Bar".into()), 8),
            (8, Dot, 9),
            (9, TypeIdentifier("Baz".into()), 12),
            (12, LeftParen, 13),
            (13, Identifier("hello".into()), 18),
            (18, Colon, 19),
            (20, Number(12.into()), 22),
            (22, RightParen, 23),
        ];

        assert_eq!(expected, tokenize("foo::Bar.Baz(hello: 12)").unwrap());
    }

    // Line and block comments are skipped, splitting adjacent tokens.
    #[test]
    pub fn test_comments() {
        let tokens = tokenize("// hello \n world");
        assert_eq!(vec![(11, Identifier("world".into()), 16)], tokens.unwrap());

        let tokens = tokenize("he/* this is a comment */llo");
        assert_eq!(
            vec![
                (0, Identifier("he".into()), 2),
                (25, Identifier("llo".into()), 28),
            ],
            tokens.unwrap()
        );

        let tokens = tokenize("// test\n// this\nhello");
        assert_eq!(vec![(16, Identifier("hello".into()), 21)], tokens.unwrap());
    }

    // Leading underscores are stripped, so `_foo` and `__foo` lex to the
    // same identifier token as `foo`.
    #[test]
    pub fn test_identifier_stripping() {
        let a = &tokenize("my_version").unwrap()[0].1;
        let b = &tokenize("_my_version").unwrap()[0].1;
        let c = &tokenize("__my_version").unwrap()[0].1;

        assert_eq!(a, b);
        assert_eq!(a, c);
    }

    // Consecutive `///` lines collapse into a single DocComment token.
    #[test]
    pub fn test_doc_comment() {
        let tokens = tokenize("/// foo\n\r /// bar \r\n /// baz ").unwrap();
        let reference = [
            (
                0,
                DocComment(vec![" foo".into(), " bar ".into(), " baz ".into()]),
                38,
            ),
        ];
        assert_eq!(reference, &tokens[..]);
    }
}