use std::{collections::HashMap, convert::TryInto};
use num_bigint::BigUint;
use regex::{CaptureLocations, Regex};
use crate::{
error::{Error, ErrorKind},
source::Location,
token::{self, Token},
};
const TABSIZE: usize = 8;
pub type LexResult = Result<(Location, Token, Location), Error>;
struct ParseStringError {
offset: usize,
}
impl ParseStringError {
fn new(offset: usize) -> ParseStringError {
ParseStringError { offset }
}
fn offset(&self) -> usize {
self.offset
}
}
pub struct Lexer<'a> {
input: &'a str,
exhausted: bool,
location: Location,
indents: Vec<(usize, usize)>,
alt_col: usize,
pending: usize,
parens: isize,
newline: bool,
caps: CaptureLocations,
}
impl<'a> Iterator for Lexer<'a> {
type Item = LexResult;
fn next(&mut self) -> Option<Self::Item> {
if !self.exhausted {
Some(self.get_token())
} else {
None
}
}
}
lazy_static::lazy_static! {
static ref TOKEN_RE: Regex = {
fn escape_keys<V>(map: &HashMap<&str, V>) -> String {
let mut tokens = map.keys().collect::<Vec<_>>();
tokens.sort_unstable_by(|a, b| a.len().cmp(&b.len()).reverse());
tokens
.iter()
.map(|x| regex::escape(x))
.collect::<Vec<_>>()
.join("|")
}
let pat = format!(
r#"^(?x:
( # 1
[\ \t\f]+
| \\ (?: \r\n | [\r\n] )
| \# .*
)
| ( \r\n | [\r\n] ) # 2
| ( $ ) # 3
| ( # 4
(?:
(?: [0-9] (?: _? [0-9] )* )? \. [0-9] (?: _? [0-9] )*
| [0-9] (?: _? [0-9] )* \.?
) [eE] [+-]? [0-9] (?: _? [0-9] )*
| (?: [0-9] (?: _? [0-9] )* )? \. [0-9] (?: _? [0-9] )*
| [0-9] (?: _? [0-9] )* \.
) ( [jJ] )? # 5
| ( [0-9] (?: _? [0-9] )* [jJ] ) # 6
| ( # 7
(?: 0[bB] (?: _? [01] )+ )
| (?: 0[oO] (?: _? [0-7] )+ )
| (?: 0[xX] (?: _? [0-9a-fA-F] )+ )
| (?: [1-9] (?: _? [0-9] )* | 0+ (?: _? 0 )* )
)
| ( (?-i: rf | fr | rb | br | [rfbu] )? ) # 8
(?:
''' ( (?s: [^\\] | \\. )*? ) ''' # 9
| """ ( (?s: [^\\] | \\. )*? ) """ # 10
| ' ( (?s: [^\\\n'] | \\. )*? ) ' # 11
| " ( (?s: [^\\\n"] | \\. )*? ) " # 12
| ( ['"] | ''' | """ ) # 13
)
| ( {symbols} ) # 14
| ( {keywords} ) \b # 15
| ( [\p{{XID_Start}}_] \p{{XID_Continue}}* ) # 16
)"#,
symbols = escape_keys(&token::SYMBOLS),
keywords = escape_keys(&token::KEYWORDS),
);
Regex::new(&pat).unwrap()
};
static ref BYTES_RE: Regex = Regex::new(
r#"(?x) \\ (?:
( [\\'"abfnrtv] )
| ( [1-7]{1,3} )
| ( [0-9a-fA-F]{2} )
| \n
)"#).unwrap();
static ref STR_RE: Regex = Regex::new(
&[BYTES_RE.as_str(),
r#"| \\ (?:
N \{ (.*) \}
| u ( [0-9a-fA-F]{4} )
| U ( [0-9a-fA-F]{8} )
)"#].concat()).unwrap();
}
impl<'a> Lexer<'a> {
pub fn new(input: &str) -> Lexer {
Lexer {
input,
exhausted: false,
location: Location::new(1, 1),
indents: vec![(1, 1)],
alt_col: 1,
pending: 0,
parens: 0,
newline: true,
caps: TOKEN_RE.capture_locations(),
}
}
fn consume(&mut self, nbytes: usize) -> &str {
let (consumed, rest) = self.input.split_at(nbytes);
self.input = rest;
let mut bytes = consumed.bytes().peekable();
while let Some(c) = bytes.next() {
match c {
b'\r' => {
if bytes.peek() == Some(&&b'\n') {
continue;
} else {
self.location = Location::new(self.location.line + 1, 1);
self.alt_col = 1;
}
}
b'\n' => {
self.location = Location::new(self.location.line + 1, 1);
self.alt_col = 1;
}
b'\t' => {
self.location = Location::new(
self.location.line,
self.location.column + TABSIZE - ((self.location.column - 1) % TABSIZE),
);
self.alt_col += 1;
}
_ => {
self.location = Location::new(self.location.line, self.location.column + 1);
self.alt_col += 1;
}
}
}
consumed
}
fn matches(&self, nth: usize) -> bool {
self.caps.get(nth).is_some()
}
fn get_token(&mut self) -> LexResult {
if self.pending > 0 {
self.pending -= 1;
let loc = Location::new(self.location.line, 0);
return Ok((loc, Token::Dedent, loc));
}
if TOKEN_RE.captures_read(&mut self.caps, self.input).is_none() {
self.exhausted = true;
let c = self.input.chars().next().unwrap();
return Err(Error::new(ErrorKind::UnexpectedCharacter(c), self.location));
}
if self.newline && !(self.matches(1) || self.matches(2)) {
let col = self.location.column;
let (last, alt_last) = *self.indents.last().unwrap();
let emit_loc = Location::new(self.location.line, col);
if col == last {
if self.alt_col != alt_last {
self.alt_col = alt_last;
return Err(Error::new(ErrorKind::MixedTabsAndSpaces, emit_loc));
}
} else if col > last {
if self.alt_col <= alt_last {
self.alt_col = alt_last;
return Err(Error::new(ErrorKind::MixedTabsAndSpaces, emit_loc));
}
self.indents.push((col, self.alt_col));
return Ok((emit_loc, Token::Indent, emit_loc));
} else {
while self.indents.len() > 1 && col < self.indents.last().unwrap().0 {
self.pending += 1;
self.indents.pop();
}
let (last, alt_last) = *self.indents.last().unwrap();
if col != last {
self.exhausted = true;
return Err(Error::new(ErrorKind::UnmatchedDedent, emit_loc));
}
if self.alt_col != alt_last {
self.alt_col = alt_last;
return Err(Error::new(ErrorKind::MixedTabsAndSpaces, emit_loc));
}
return self.get_token();
}
}
let start = self.location;
let match_len = self.caps.get(0).unwrap().1;
let lex = &self.input[..match_len];
self.consume(match_len);
let end = self.location;
if self.matches(1) {
return self.get_token();
} else if self.matches(2) {
if !self.newline && self.parens == 0 {
self.newline = true;
return Ok((start, Token::Newline, start));
} else {
return self.get_token();
}
}
self.newline = false;
if self.matches(3) {
self.exhausted = true;
Ok((start, Token::Eof, start))
} else if self.matches(4) && !self.matches(5) {
Ok((start, Token::Float(Self::parse_float(lex).unwrap()), end))
} else if self.matches(5) || self.matches(6) {
let lex = &lex[..lex.len() - 1]; Ok((
start,
Token::Imaginary(Self::parse_float(lex).unwrap()),
end,
))
} else if self.matches(7) {
Ok((start, Token::Integer(Self::parse_int(lex).unwrap()), end))
} else if self.matches(8) {
if self.matches(13) {
self.exhausted = true;
return Err(Error::new(ErrorKind::UnterminatedString, start));
}
let prefix = self
.caps
.get(8)
.map(|(start, end)| lex[start..end].to_ascii_lowercase());
let raw = prefix.as_ref().map_or(false, |x| x.contains('r'));
let formatted = prefix.as_ref().map_or(false, |x| x.contains('f'));
let bytes = prefix.as_ref().map_or(false, |x| x.contains('b'));
let value = self
.caps
.get(9)
.or_else(|| self.caps.get(10))
.or_else(|| self.caps.get(11))
.or_else(|| self.caps.get(12))
.map(|(start, end)| &lex[start..end])
.unwrap();
if bytes {
let value = if !raw {
Self::parse_bytes(value).map_err(|e| {
Error::new(ErrorKind::NonAsciiBytes { offset: e.offset() }, start)
})?
} else {
value.as_bytes().to_vec()
};
Ok((start, Token::Bytes(value), end))
} else {
let value = if !raw {
Self::parse_str(value).map_err(|e| {
Error::new(ErrorKind::UnicodeDecode { offset: e.offset() }, start)
})?
} else {
value.to_string()
};
if formatted {
Ok((start, Token::FormattedString(value), end))
} else {
Ok((start, Token::String(value), end))
}
}
} else if self.matches(14) {
let tok = token::SYMBOLS.get(lex).unwrap();
match tok {
Token::ParenOpen | Token::BracketOpen | Token::BraceOpen => self.parens += 1,
Token::ParenClose | Token::BracketClose | Token::BraceClose => self.parens -= 1,
_ => (),
}
Ok((start, tok.clone(), end))
} else if self.matches(15) {
let tok = token::KEYWORDS.get(lex).unwrap();
Ok((start, tok.clone(), end))
} else if self.matches(16) {
Ok((start, Token::Name(lex.into()), end))
} else {
unreachable!()
}
}
fn parse_bytes(mut s: &str) -> Result<Vec<u8>, ParseStringError> {
if let Some(offset) = s.bytes().position(|c| !c.is_ascii()) {
return Err(ParseStringError::new(offset));
}
let mut result = vec![];
let mut caps = BYTES_RE.capture_locations();
while let Some(cap) = BYTES_RE.captures_read(&mut caps, s) {
result.extend(s[..cap.start()].bytes());
s = &s[cap.end()..];
let escape = &cap.as_str()[1..];
let value = if caps.get(1).is_some() {
Self::ascii_escape(escape.as_bytes()[0])
} else if caps.get(2).is_some() {
u8::from_str_radix(escape, 8).unwrap()
} else if caps.get(3).is_some() {
u8::from_str_radix(escape, 16).unwrap()
} else {
continue;
};
result.push(value);
}
result.extend(s.bytes());
Ok(result)
}
fn parse_str(mut s: &str) -> Result<String, ParseStringError> {
let mut result = String::new();
let mut caps = STR_RE.capture_locations();
let mut offset = 0;
while let Some(cap) = STR_RE.captures_read(&mut caps, s) {
result.push_str(&s[..cap.start()]);
s = &s[cap.end()..];
let escape = &cap.as_str()[1..];
let value = if caps.get(1).is_some() {
Self::ascii_escape(escape.as_bytes()[0]) as char
} else if caps.get(2).is_some() {
u8::from_str_radix(escape, 8).unwrap() as char
} else if caps.get(3).is_some() {
u8::from_str_radix(escape, 16).unwrap() as char
} else if caps.get(4).is_some() {
unicode_names2::character(&escape[2..escape.len() - 1])
.ok_or_else(|| ParseStringError::new(offset))?
} else if caps.get(5).is_some() || caps.get(6).is_some() {
u32::from_str_radix(&escape[1..], 16)
.unwrap()
.try_into()
.map_err(|_| ParseStringError::new(offset))?
} else {
offset += 2;
continue;
};
result.push(value);
offset += cap.end();
}
result.push_str(s);
Ok(result)
}
fn ascii_escape(c: u8) -> u8 {
match c {
b'n' => b'\n',
b'r' => b'\r',
b't' => b'\t',
b'a' => b'\x07',
b'b' => b'\x08',
b'f' => b'\x0c',
b'v' => b'\x0b',
c => c,
}
}
fn parse_int(s: &str) -> Option<BigUint> {
let mut b = s.as_bytes();
let radix = match b.get(1) {
Some(b'b') | Some(b'B') => 2,
Some(b'o') | Some(b'O') => 8,
Some(b'x') | Some(b'X') => 16,
_ => 10,
};
if radix != 10 {
b = &b[2..];
}
if b[0] == b'_' {
b = &b[1..];
}
BigUint::parse_bytes(b, radix)
}
fn parse_float(s: &str) -> Option<f64> {
if !s.contains('_') {
s.parse().ok()
} else {
s.replace('_', "").parse().ok()
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::token::Token::*;
fn lex(i: &str) -> Vec<Result<Token, ErrorKind>> {
Lexer::new(i)
.map(|x| x.map(|x| x.1).map_err(|e| e.kind))
.collect()
}
macro_rules! assert_lex {
($l:expr, $r:expr) => {{
assert_eq!(lex($l), vec![Ok($r), Ok(Eof)])
}};
}
#[test]
fn test_lex_indent() {
assert_eq!(
lex("a\n b\nc"),
vec![
Ok(Name("a".into())),
Ok(Newline),
Ok(Indent),
Ok(Name("b".into())),
Ok(Newline),
Ok(Dedent),
Ok(Name("c".into())),
Ok(Eof)
]
);
assert_eq!(
lex(" a\n b"),
vec![
Ok(Indent),
Ok(Name("a".into())),
Ok(Newline),
Err(ErrorKind::UnmatchedDedent),
]
);
assert_eq!(
lex(" \ta\n\tb"),
vec![
Ok(Indent),
Ok(Name("a".into())),
Ok(Newline),
Err(ErrorKind::MixedTabsAndSpaces),
Ok(Name("b".into())),
Ok(Eof)
]
);
}
#[test]
fn test_lex_newline() {
assert_eq!(
lex("a\n\r\nb\r"),
vec![
Ok(Name("a".into())),
Ok(Newline),
Ok(Name("b".into())),
Ok(Newline),
Ok(Eof),
]
);
}
#[test]
fn test_lex_integer() {
assert_lex!("1234567890", Integer(1234567890u64.into()));
assert_lex!("0b10", Integer(0b10u64.into()));
assert_lex!("0o12345670", Integer(0o12345670u64.into()));
assert_lex!("0x123456789abcdef0", Integer(0x123456789abcdef0u64.into()));
assert_lex!("1_2_3_4_5", Integer(12345u64.into()));
assert_lex!("0x_1_2_3", Integer(0x1_2_3u64.into()));
assert_lex!("000", Integer(0u64.into()));
}
#[test]
fn test_lex_float() {
assert_lex!("3.14", Float(3.14));
assert_lex!("10.", Float(10.));
assert_lex!(".001", Float(0.001));
assert_lex!("1e100", Float(1e100));
assert_lex!("3.14e-10", Float(3.14e-10));
assert_lex!("0e0", Float(0e0));
assert_lex!("3.14_15_93", Float(3.14_15_93));
assert_lex!("0.1e+10", Float(0.1e+10));
assert_lex!("1.e1", Float(1.0e1));
}
#[test]
fn lex_string() {
assert_lex!("'foo'", String("foo".into()));
assert_lex!("'foo\\tbar'", String("foo\tbar".into()));
assert_lex!("'\\x'", String("\\x".into()));
assert_lex!("'\\N{SNOWMAN}'", String("☃".into()));
assert_lex!("b'\\N{SNOWMAN}'", Bytes(b"\\N{SNOWMAN}".to_vec()));
assert_lex!("'\\\\\\'\"'", String("\\'\"".into()));
assert_lex!("'a \\\nb'", String("a b".into()));
assert_lex!("'''a \nb\\\nc'''", String("a \nbc".into()));
assert_lex!("r'\\t\\\n\\''", String("\\t\\\n\\'".into()))
}
}