use super::error::GResult;
/// Mutable lexer state that is carried from one `lex` call to the next.
pub(crate) struct Lexer {
    /// Current line number. Starts at 1 and is advanced by the number of `'\n'` bytes
    /// contained in each token that is lexed.
    pub(crate) line_number: usize,

    /// Whether the lexer is currently outside any literal, or inside a string literal,
    /// a raw string literal or a block comment.
    pub(crate) str_status: StrStatus,
}
impl Lexer {
    /// Creates a lexer positioned on line 1, outside of any string or block comment.
    pub(crate) fn new() -> Lexer {
        let line_number = 1;
        let str_status = StrStatus::OutsideStr;

        Lexer { line_number, str_status }
    }

    /// Lexes one token from the front of `text`, advancing `text` past it.
    ///
    /// This is a thin method wrapper around the free function `lex` in this module.
    pub(crate) fn lex<'a>(&mut self, text: &mut &'a str) -> GResult<Tok<'a>> {
        let lexer = self;
        lex(lexer, text)
    }
}
/// A single token produced by the lexer.
pub(crate) struct Tok<'a> {
    /// The category of this token.
    pub(crate) tok_type: TokType,

    /// The slice of the input text which this token covers.
    pub(crate) text: &'a str,
}
/// The category of a token emitted by `lex`.
///
/// The variant order is kept exactly as originally declared, so discriminant values are
/// unchanged.
#[derive(Copy, Clone, Debug, PartialEq)]
pub(crate) enum TokType {
    /// A run of whitespace characters, a `;` line comment, or (part of) a `#| ... |#`
    /// block comment.
    Whitespace,
    /// A run of symbol characters, optionally terminated by a single `#`.
    NumOrSym,
    /// `#t`
    True,
    /// `#f`
    False,
    /// `#n`
    Nil,
    /// `#;`
    FormComment,

    /// A backslash followed by any single character not matched by the variants below.
    Char,
    /// `\space`
    SpaceChar,
    /// `\tab`
    TabChar,
    /// `\newline`
    NewlineChar,
    /// `\return`
    ReturnChar,
    /// `\nul`
    NulChar,
    /// `\x` followed by one octal digit and one hexadecimal digit.
    AsciiChar,
    /// `\u{` followed by hexadecimal digits and a closing `}`.
    UnicodeChar,

    /// `(`
    ArrOpen,
    /// `#(`
    TabOpen,
    /// `)`
    ArrClose,
    /// `[`
    AccessOpen,
    /// `]`
    AccessClose,

    /// The `"` which begins a string literal.
    StrOpen,
    /// The `"` which ends a string literal.
    StrClose,
    /// A single `{` inside a string literal; the lexer switches back to lexing normal
    /// tokens.
    StrPause,
    /// A `}` encountered outside a string literal; the lexer switches back to lexing
    /// string contents.
    StrResume,
    /// A run of string contents, including `\\`, `\"`, `{{` and `}}` pairs.
    StrChars,

    /// `r`, zero or more `#`, then `"`.
    RawStrOpen,
    /// `"` followed by the same number of `#` as the opening delimiter.
    RawStrClose,
    /// The contents of a raw string literal.
    RawStrChars,

    /// `'`
    Quote,
    /// `` ` ``
    Backquote,
    /// `~`
    Unquote,
    /// `..`
    Splay,
    /// A single `.`
    MethName,
    /// `@`
    Atsign,
}
/// Which kind of multi-token construct, if any, the lexer is currently in the middle of.
#[derive(Copy, Clone, PartialEq)]
pub(crate) enum StrStatus {
    /// Lexing ordinary tokens.
    OutsideStr,

    /// Between the opening and closing `"` of a string literal.
    InsideStr,

    /// Inside a raw string literal. The payload is the number of `#` characters in its
    /// opening delimiter, which the closing delimiter must repeat.
    InsideRawStr(usize),

    /// Inside a `#| ... |#` block comment. The payload counts the nested `#|` openers seen
    /// since the outermost one, so it is 0 directly inside the outermost comment.
    InsideBlockComment(usize),
}
fn lex<'a>(lexer: &mut Lexer, text: &mut &'a str) -> GResult<Tok<'a>> {
assert!(text.len() > 0, "empty string passed to lex()");
let mut chars = text.chars();
let first = chars.next().unwrap();
let (tok_type, len) = match lexer.str_status {
StrStatus::InsideStr => {
let second = chars.clone().next();
match first {
'"' => {
lexer.str_status = StrStatus::OutsideStr;
(TokType::StrClose, 1)
}
'{' if second != Some('{') => {
lexer.str_status = StrStatus::OutsideStr;
(TokType::StrPause, 1)
}
'}' if second != Some('}') => bail!("unexpected }} in str literal"),
_ => {
let mut len = 0;
let mut input = &text[..];
while input.len() > 0 {
if input.starts_with("\\\\") ||
input.starts_with("\\\"") ||
input.starts_with("{{") ||
input.starts_with("}}") {
len += 2;
input = &input[2..];
}
else if input.starts_with('"') ||
input.starts_with('{') ||
input.starts_with('}') {
break
}
else {
let ch = input.chars().next().unwrap();
let ch_len = ch.len_utf8();
len += ch_len;
input = &text[len..];
}
}
(TokType::StrChars, len)
}
}
}
StrStatus::InsideRawStr(num_hashes) => {
fn starts_with_delimiter(st: &str, num_hashes: usize) -> bool {
assert!(st.len() > 0);
let mut chars = st.chars();
let first = chars.next().unwrap();
if first == '"' {
for _ in 0 .. num_hashes {
if chars.next() != Some('#') {
return false
}
}
true
} else {
false
}
}
if starts_with_delimiter(text, num_hashes) {
lexer.str_status = StrStatus::OutsideStr;
(TokType::RawStrClose, num_hashes + 1)
} else {
let mut len = first.len_utf8();
let mut st = chars.as_str();
while let Some(ch) = chars.next() {
if starts_with_delimiter(st, num_hashes) {
break
} else {
len += ch.len_utf8();
}
st = chars.as_str();
}
(TokType::RawStrChars, len)
}
}
StrStatus::InsideBlockComment(mut nesting) => {
let mut len = 0;
let mut input = *text;
while input.len() > 0 {
if input.starts_with("#|") {
len += 2;
input = &input[2..];
nesting += 1;
lexer.str_status = StrStatus::InsideBlockComment(nesting);
}
else if input.starts_with("|#") {
len += 2;
input = &input[2..];
if nesting == 0 {
lexer.str_status = StrStatus::OutsideStr;
break
} else {
nesting -= 1;
lexer.str_status = StrStatus::InsideBlockComment(nesting);
}
}
else {
let mut chars = input.chars();
len += chars.next().unwrap().len_utf8();
input = chars.as_str();
}
}
(TokType::Whitespace, len)
}
StrStatus::OutsideStr => {
let second = chars.clone().next();
match (first, second) {
(ch, _) if char_is_whitespace(ch) => {
let mut len = 1;
while let Some(ch) = chars.next() {
if !char_is_whitespace(ch) {
break
}
len += ch.len_utf8();
}
(TokType::Whitespace, len)
}
(';', _) => {
let mut len = 1;
while let Some(ch) = chars.next() {
len += ch.len_utf8();
if ch == '\n' {
break
}
}
(TokType::Whitespace, len)
}
('\\', _) => {
let rest = chars.as_str();
if rest.starts_with("space") {
(TokType::SpaceChar, 6)
} else if rest.starts_with("tab") {
(TokType::TabChar, 4)
} else if rest.starts_with("newline") {
(TokType::NewlineChar, 8)
} else if rest.starts_with("return") {
(TokType::ReturnChar, 7)
} else if rest.starts_with("nul") {
(TokType::NulChar, 4)
} else if rest.starts_with("u{") {
chars.next().unwrap();
chars.next().unwrap();
let mut digits = 0;
loop {
match chars.next() {
Some('}') => break,
Some(ch) if ch.is_digit(16) => digits += 1,
_ => bail!("malformed unicode char escape")
}
}
(TokType::UnicodeChar, digits + 4)
} else {
if let Some(second) = second {
chars.next().unwrap();
match (second, chars.next(), chars.next()) {
('x', Some(hi), Some(lo)) if hi.is_digit(8) && lo.is_digit(16) => {
(TokType::AsciiChar, 4)
}
_ => (TokType::Char, 1 + second.len_utf8())
}
} else {
bail!("input ends with \\ character")
}
}
}
('#', Some('t')) => (TokType::True, 2),
('#', Some('f')) => (TokType::False, 2),
('#', Some('n')) => (TokType::Nil, 2),
('#', Some(';')) => (TokType::FormComment, 2),
('#', Some('|')) => {
lexer.str_status = StrStatus::InsideBlockComment(0);
(TokType::Whitespace, 2)
}
('(', _) => (TokType::ArrOpen, 1),
('#', Some('(')) => (TokType::TabOpen, 2),
(')', _) => (TokType::ArrClose, 1),
('[', _) => (TokType::AccessOpen, 1),
(']', _) => (TokType::AccessClose, 1),
('"', _) => {
lexer.str_status = StrStatus::InsideStr;
(TokType::StrOpen, 1)
}
('}', _) => {
lexer.str_status = StrStatus::InsideStr;
(TokType::StrResume, 1)
}
('{', _) => bail!("unexpected {{"),
('r', Some('"')) | ('r', Some('#')) => {
let mut num_hashes = 0;
loop {
match chars.next() {
Some('"') => break,
Some('#') => num_hashes += 1,
_ => bail!("malformed raw str")
}
}
lexer.str_status = StrStatus::InsideRawStr(num_hashes);
(TokType::RawStrOpen, 2 + num_hashes)
}
('\'', _) => (TokType::Quote, 1),
('`', _) => (TokType::Backquote, 1),
('~', _) => (TokType::Unquote, 1),
('.', Some('.')) => (TokType::Splay, 2),
('.', _) => (TokType::MethName, 1),
('@', _) => (TokType::Atsign, 1),
(first, _) if is_valid_sym_char(first) => {
let mut len = first.len_utf8();
while let Some(ch) = chars.next() {
if ch == '#' {
len += 1;
break
}
if !is_valid_sym_char(ch) {
break
}
len += ch.len_utf8();
}
(TokType::NumOrSym, len)
}
_ => bail!("unexpected character '{}'", first)
}
}
};
let (tok_text, rest) = text.split_at(len);
lexer.line_number += tok_text.bytes().filter(|byte| *byte == b'\n').count();
*text = rest;
Ok(Tok {
tok_type,
text: tok_text
})
}
/// Returns `true` if `ch` may appear in a number or symbol token: an ASCII letter, an
/// ASCII digit, or one of the punctuation characters `! $ % & * + - . / : < = > ? ^ _ ~`.
pub(crate) fn is_valid_sym_char(ch: char) -> bool {
    let is_sym_punctuation = matches!(
        ch,
        '!' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | ':' | '<' | '=' | '>' | '?' |
        '^' | '_' | '~'
    );

    ch.is_ascii_alphanumeric() || is_sym_punctuation
}
/// Returns `true` if the lexer treats `ch` as whitespace.
///
/// Besides the ASCII control whitespace characters and the space, this includes the comma
/// and a handful of non-ASCII characters (U+0085, U+200E, U+200F, U+2028, U+2029).
pub(crate) fn char_is_whitespace(ch: char) -> bool {
    matches!(
        ch,
        // '\t', '\n', U+000B, U+000C and '\r' are contiguous code points
        '\t' ..= '\r'
            | ' '
            | ','
            | '\u{85}'
            | '\u{200E}' ..= '\u{200F}'
            | '\u{2028}' ..= '\u{2029}'
    )
}