use crate::token::Keyword;
use crate::token::LitKind;
use crate::token::Operator;
use crate::token::Token;
use crate::Error;
use crate::Result;
use std::fs;
use std::path::Path;
use std::path::PathBuf;
use std::str::FromStr;
use unic_ucd_category::GeneralCategory;
/// Lexical scanner state over a single source text.
#[derive(Default)]
pub struct Scanner {
/// Offset of the next unread character, counted in characters
/// (`next_token` advances it by the token's `char_count`).
pos: usize,
/// Offset of the next unread character, counted in bytes; used to slice `source`
/// (`next_token` advances it by the token's `str_len`).
index: usize,
/// Set by `next_token` when the token just returned may be followed by an
/// automatically inserted `;`.
semicolon: bool,
/// The complete text being scanned.
source: String,
/// Character offsets recorded through `add_line`, each one position past a `'\n'`.
lines: Vec<usize>,
/// File the text was read from; `None` when built with `from`.
path: Option<PathBuf>,
}
impl Scanner {
pub(crate) fn from<S: AsRef<str>>(s: S) -> Self {
Self {
source: s.as_ref().to_string(),
..Default::default()
}
}
pub(crate) fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
const BOM: &str = "\u{feff}";
let mut source = fs::read_to_string(&path)?;
if source.starts_with(BOM) {
source = source.split_off(3);
}
Ok(Self {
source,
path: Some(path.as_ref().into()),
..Default::default()
})
}
pub(crate) fn path(&self) -> Option<PathBuf> {
self.path.clone()
}
pub(crate) fn position(&self) -> usize {
self.pos
}
pub(crate) fn preback(&self) -> (usize, usize, bool) {
(self.pos, self.index, self.semicolon)
}
/// Restores the `(pos, index, semicolon)` triple previously returned by `preback`.
///
/// Only these three fields are rewound; `lines` is left untouched.
pub(crate) fn goback(&mut self, pre: (usize, usize, bool)) {
    let (pos, index, semicolon) = pre;
    self.pos = pos;
    self.index = index;
    self.semicolon = semicolon;
}
pub(crate) fn line_info(&self, pos: usize) -> (usize, usize) {
self.lines
.iter()
.enumerate()
.take_while(|(_, &start)| pos >= start)
.last()
.map(|(index, &start)| (index + 1, pos - start))
.unwrap_or((1, pos))
}
/// Appends `line_start` to the list of recorded line-start offsets.
fn add_line(&mut self, line_start: usize) {
    self.lines.extend(std::iter::once(line_start));
}
/// Builds an error located at the scanner's current character offset.
fn error<S: AsRef<str>>(&self, reason: S) -> Error {
    let here = self.pos;
    self.error_at(here, reason)
}
/// Builds an `Error::Else` carrying the scanner's path, the `(line, column)` that
/// `line_info` reports for `pos`, and the given reason text.
fn error_at<S: AsRef<str>>(&self, pos: usize, reason: S) -> Error {
    let path = self.path();
    let location = self.line_info(pos);
    Error::Else {
        path,
        location,
        reason: reason.as_ref().into(),
    }
}
/// Peeks at the character `skp` characters ahead of the current byte index
/// without consuming anything; `None` when the text ends before it.
fn next_char(&mut self, skp: usize) -> Option<char> {
    let mut upcoming = self.source[self.index..].chars();
    upcoming.nth(skp)
}
/// Peeks at up to `n` characters starting at the current byte index and returns
/// them as an owned string (shorter when the text ends first).
fn next_nchar(&mut self, n: usize) -> String {
    let rest = &self.source[self.index..];
    // Byte offset at which the (n + 1)-th character starts, or the end of the text.
    let end = rest
        .char_indices()
        .nth(n)
        .map_or(rest.len(), |(offset, _)| offset);
    rest[..end].to_owned()
}
/// Reports whether `tok` is a token after which a line end triggers automatic
/// semicolon insertion: any literal, `++`, `--`, a closing `)`, `}` or `]`, or one
/// of the keywords `break`, `return`, `continue`, `fallthrough`.
#[rustfmt::skip]
fn try_insert_semicolon2(&mut self, tok: &Token) -> bool {
    match tok {
        Token::Literal(..) => true,
        Token::Operator(op) => matches!(
            op,
            | Operator::Inc
            | Operator::Dec
            | Operator::ParenRight
            | Operator::BraceRight
            | Operator::BarackRight
        ),
        Token::Keyword(word) => matches!(
            word,
            | Keyword::Break
            | Keyword::Return
            | Keyword::Continue
            | Keyword::FallThrough
        ),
        _ => false,
    }
}
/// Reports whether nothing but whitespace and comments remains on the current line.
///
/// Returns `true` on end of text, on a newline, on a `//` comment, and on a `/* */`
/// comment that contains a newline or runs to the end of the text. A block comment
/// closed on the same line is skipped and the check continues after it.
fn line_ended(&mut self) -> bool {
    let mut rest = self.source[self.index..].chars().peekable();
    while let Some(ch) = rest.next() {
        if ch == '\n' {
            return true;
        }
        if ch.is_whitespace() {
            continue;
        }
        if ch != '/' {
            return false;
        }
        match rest.next() {
            Some('/') => return true,
            Some('*') => loop {
                match rest.next() {
                    None | Some('\n') => return true,
                    // `next_if_eq` only consumes the '/' when it really closes the comment.
                    Some('*') if rest.next_if_eq(&'/').is_some() => break,
                    _ => {}
                }
            },
            _ => return false,
        }
    }
    true
}
/// Consumes leading whitespace and returns how many characters were skipped.
///
/// Every `'\n'` passed records the following character offset as a line start.
/// `pos` advances by one per character, while `index` advances by the character's
/// UTF-8 length so it always stays on a character boundary (whitespace such as
/// U+00A0 or U+3000 is wider than one byte).
pub(crate) fn skip_whitespace(&mut self) -> usize {
    let mut skipped = 0;
    while let Some(ch) = self.next_char(0) {
        if !ch.is_whitespace() {
            break;
        }
        if ch == '\n' {
            self.add_line(self.pos + 1);
        }
        skipped += 1;
        self.pos += 1;
        self.index += ch.len_utf8();
    }
    skipped
}
/// Produces the next token together with the character offset it starts at.
///
/// When the previous token allows it and the rest of the line is empty, a `;`
/// operator token is returned first without consuming any text. Returns
/// `Ok(None)` once only whitespace remains.
///
/// # Errors
/// Propagates any error from `scan_token`.
pub(crate) fn next_token(&mut self) -> Result<Option<(usize, Token)>> {
    // The flag is one-shot: it is cleared whether or not a semicolon is emitted.
    let semicolon_pending = self.semicolon;
    self.semicolon = false;
    if semicolon_pending && self.line_ended() {
        return Ok(Some((self.pos, Token::Operator(Operator::SemiColon))));
    }

    self.skip_whitespace();
    if self.index >= self.source.len() {
        return Ok(None);
    }

    let start = self.pos;
    let token = self.scan_token()?;
    // Record line starts inside the token before `pos` moves past it.
    self.add_token_cross_line(&token);
    self.index += token.str_len();
    self.pos += token.char_count();
    self.semicolon = self.try_insert_semicolon2(&token);
    Ok(Some((start, token)))
}
/// Records a line start for every `'\n'` inside a comment or literal token.
///
/// Offsets are relative to the current `pos`, so this must run before `pos` is
/// advanced past the token. Other token kinds are ignored.
fn add_token_cross_line(&mut self, tok: &Token) {
    let text = match tok {
        Token::Comment(text) | Token::Literal(.., text) => text,
        _ => return,
    };
    let newline_offsets = text
        .chars()
        .enumerate()
        .filter(|&(_, ch)| ch == '\n')
        .map(|(offset, _)| offset);
    for offset in newline_offsets {
        self.add_line(self.pos + offset + 1);
    }
}
/// Recognises the token that starts at the current position without consuming it.
///
/// Three-character operators are tried first, then comments and two-character
/// operators, then numbers, rune and string literals, identifiers/keywords and
/// finally single-character operators.
///
/// # Errors
/// Returns an error when the text is exhausted, when the next character starts no
/// known token, or when a nested literal/comment scanner fails.
pub(crate) fn scan_token(&mut self) -> Result<Token> {
    if let Ok(op) = Operator::from_str(&self.next_nchar(3)) {
        return Ok(op.into());
    }

    let pair = self.next_nchar(2);
    match pair.as_str() {
        "//" => return Ok(Token::Comment(self.scan_line_comment())),
        "/*" => return Ok(Token::Comment(self.scan_general_comment()?)),
        two => {
            if let Ok(op) = Operator::from_str(two) {
                return Ok(op.into());
            }
        }
    }

    // Report exhausted input as an error instead of panicking on `unwrap`.
    let first = self
        .next_char(0)
        .ok_or_else(|| self.error("unexpected end of input"))?;
    let digit_follows = matches!(self.next_char(1), Some('0'..='9'));

    let token = match first {
        c if is_decimal_digit(c) => self.scan_lit_number()?,
        // A leading '.' only starts a number when a digit follows it.
        '.' if digit_follows => self.scan_lit_number()?,
        '\'' => Token::Literal(LitKind::Char, self.scan_lit_rune()?),
        '"' | '`' => Token::Literal(LitKind::String, self.scan_lit_string()?),
        c if is_letter(c) => {
            let identifier = self.scan_identifier();
            match Keyword::from_str(&identifier) {
                Ok(word) => Token::Keyword(word),
                Err(_) => Token::Literal(LitKind::Ident, identifier),
            }
        }
        other => match Operator::from_str(&other.to_string()) {
            Ok(op) => op.into(),
            Err(_) => return Err(self.error(format!("unresolved character {:?}", other))),
        },
    };
    Ok(token)
}
/// Returns the text from the current position up to, but not including, the next
/// `'\n'` (or to the end of the text). A `'\r'` before the newline is kept.
fn scan_line_comment(&mut self) -> String {
    let rest = &self.source[self.index..];
    let end = rest.find('\n').unwrap_or(rest.len());
    rest[..end].to_owned()
}
/// Returns the `/* ... */` comment that starts at the current position,
/// delimiters included.
///
/// The terminator is searched for *after* the opening `/*`, so the `*` of the
/// opener can never double as the `*` of the closer (`/*/` is not a comment).
///
/// # Errors
/// Returns an error when no closing `*/` exists.
///
/// # Panics
/// Panics if the text at the current position does not start with `/*`.
fn scan_general_comment(&mut self) -> Result<String> {
    const OPEN: &str = "/*";
    const CLOSE: &str = "*/";
    let source = &self.source[self.index..];
    assert!(
        source.starts_with(OPEN),
        "scan_general_comment must be called on a \"/*\""
    );
    match source[OPEN.len()..].find(CLOSE) {
        Some(offset) => Ok(source[..OPEN.len() + offset + CLOSE.len()].to_owned()),
        None => Err(self.error("comment no termination '*/'")),
    }
}
/// Returns the longest run of letters (including `_`) and Unicode decimal digits
/// starting at the current position.
fn scan_identifier(&mut self) -> String {
    let rest = &self.source[self.index..];
    let end = rest
        .find(|ch: char| !(is_letter(ch) || is_unicode_digit(ch)))
        .unwrap_or(rest.len());
    rest[..end].to_owned()
}
/// Scans one character or one backslash escape starting at byte offset `index`
/// and returns its source text.
///
/// Supported escapes: `\xHH`, `\uHHHH`, `\UHHHHHHHH`, three-digit octal `\OOO`, and
/// the single-character escapes accepted by `is_escaped_char`.
///
/// # Errors
/// Returns an error when the text ends early, on a raw newline, on an unknown
/// escape, on a malformed digit sequence, on an octal escape above 255, or when
/// the escaped value is not a valid Unicode scalar value.
fn scan_rune(&mut self, index: usize) -> Result<String> {
    let hex: fn(char) -> bool = is_hex_digit;
    let octal: fn(char) -> bool = is_octal_digit;

    let mut chars = self.source[index..].chars();
    match chars.next() {
        Some('\\') => {}
        Some(ch) if is_unicode_char(ch) => return Ok(String::from(ch)),
        Some(_) => return Err(self.error_at(self.pos, "unexpected character")),
        None => return Err(self.error_at(self.pos, "literal not terminated")),
    }

    let selector = match chars.next() {
        Some(ch) => ch,
        None => return Err(self.error("literal not terminated")),
    };
    // (radix, number of digits still to read after the selector, digit predicate)
    let (radix, remaining, valid) = match selector {
        'x' => (16, 2, hex),
        'u' => (16, 4, hex),
        'U' => (16, 8, hex),
        ch if is_octal_digit(ch) => (8, 2, octal),
        ch if is_escaped_char(ch) => return Ok(format!("\\{}", ch)),
        _ => return Err(self.error("unknown escape sequence")),
    };

    // For octal escapes the selector is itself the first digit of the value.
    let mut digits = String::new();
    if radix == 8 {
        digits.push(selector);
    }
    for _ in 0..remaining {
        match chars.next() {
            Some(ch) if valid(ch) => digits.push(ch),
            Some(_) => return Err(self.error("illegal rune literal")),
            None => return Err(self.error("literal not terminated")),
        }
    }

    let value = u32::from_str_radix(&digits, radix)
        .expect("at most eight validated digits always fit in a u32");
    // Octal escapes denote a single byte value and must not exceed 255.
    if radix == 8 && value > 0xFF {
        return Err(self.error("octal escape value > 255"));
    }
    // Rejects surrogate halves and values above U+10FFFF.
    if char::from_u32(value).is_none() {
        return Err(self.error("invalid Unicode code point"));
    }

    Ok(if radix == 8 {
        format!("\\{}", digits)
    } else {
        format!("\\{}{}", selector, digits)
    })
}
/// Scans the rune literal that starts at the current position and returns its
/// source text, quotes included.
///
/// # Errors
/// Returns an error when the literal is empty or holds an unescaped `'`, when the
/// contained rune is invalid, or when the closing `'` is missing.
///
/// # Panics
/// Panics if the text at the current position does not start with `'`.
fn scan_lit_rune(&mut self) -> Result<String> {
    assert!(
        self.source[self.index..].starts_with('\''),
        "scan_lit_rune must be called on an opening quote"
    );
    let rune = self.scan_rune(self.index + 1)?;
    // A bare quote right after the opener means `''` or `'''`; neither is a rune.
    if rune == "'" {
        return Err(self.error_at(
            self.pos,
            "empty rune literal or unescaped ' in rune literal",
        ));
    }
    // `after` lies on a character boundary because `rune` was copied from the text.
    let after = self.index + 1 + rune.len();
    match self.source[after..].chars().next() {
        Some('\'') => Ok(format!("'{}'", rune)),
        Some(_) => Err(self.error_at(self.pos, "rune literal expect termination")),
        None => Err(self.error_at(self.pos, "rune literal not termination")),
    }
}
/// Scans the string literal that starts at the current position and returns its
/// source text, quotes included.
///
/// A literal opened with a backtick is raw: everything up to the next backtick is
/// taken verbatim. Any other opening quote starts an interpreted string whose
/// content is read rune by rune through `scan_rune`, so an escaped quote does not
/// end it.
///
/// # Errors
/// Returns an error when a contained rune is invalid or when the text ends before
/// the closing quote. Termination is tracked explicitly, so a lone opening quote
/// or a text ending in an escaped quote is reported as unterminated.
///
/// # Panics
/// Panics if there is no character at the current position.
fn scan_lit_string(&mut self) -> Result<String> {
    let quote = self
        .next_char(0)
        .expect("scan_lit_string must be called on an opening quote");
    let mut result = String::from(quote);
    let mut terminated = false;
    let body_start = self.index + quote.len_utf8();

    if quote == '`' {
        for ch in self.source[body_start..].chars() {
            result.push(ch);
            if ch == quote {
                terminated = true;
                break;
            }
        }
    } else {
        let closing = quote.to_string();
        let mut index = body_start;
        while index < self.source.len() {
            let rune = self.scan_rune(index)?;
            index += rune.len();
            result.push_str(&rune);
            // An escaped quote comes back as two characters and never matches.
            if rune == closing {
                terminated = true;
                break;
            }
        }
    }

    if terminated {
        Ok(result)
    } else {
        let offset = self.pos + result.chars().count();
        Err(self.error_at(offset, "string literal not terminated"))
    }
}
/// Collects digits and single `_` separators starting `n` bytes past the current
/// byte index.
///
/// Scanning stops at the first character that is neither `_` nor accepted by
/// `valid`, and also at the second of two consecutive underscores. A leading or
/// trailing single underscore is kept so callers can diagnose it.
fn scan_digits(&mut self, n: usize, valid: fn(char) -> bool) -> String {
    let mut digits = String::new();
    let mut underscore_allowed = true;
    for ch in self.source[self.index + n..].chars() {
        if ch == '_' {
            if !underscore_allowed {
                break;
            }
            underscore_allowed = false;
        } else if valid(ch) {
            underscore_allowed = true;
        } else {
            break;
        }
        digits.push(ch);
    }
    digits
}
fn scan_lit_number(&mut self) -> Result<Token> {
let chars = self.source[self.index..].chars();
let next2 = chars.take(2).collect::<String>();
let (radix, int_part) = self
.next_char(0)
.and_then(|ch| match ch {
'.' => None,
_ => Some(match next2.as_str() {
"0b" | "oB" => (2, next2 + &self.scan_digits(2, is_binary_digit)),
"0o" | "0O" => (8, next2 + &self.scan_digits(2, is_decimal_digit)),
"0x" | "0X" => (16, next2 + &self.scan_digits(2, is_hex_digit)),
_ => (10, self.scan_digits(0, is_decimal_digit)),
}),
})
.unwrap_or((10, String::new()));
if int_part.ends_with('_') {
return Err(self.error_at(
self.pos + int_part.len(),
"'_' must separate successive digits",
));
}
let skipped = int_part.len();
let fac_part = (self.next_char(skipped) == Some('.'))
.then(|| match radix {
2 | 8 => Err(self.error_at(self.pos + skipped, "invalid radix point")),
16 => Ok(".".to_owned() + &self.scan_digits(skipped + 1, is_hex_digit)),
_ => Ok(".".to_owned() + &self.scan_digits(skipped + 1, is_decimal_digit)),
})
.unwrap_or(Ok(String::new()))?;
if fac_part.starts_with("._") || fac_part.ends_with('_') {
return Err(self.error_at(self.pos + skipped, "'_' must separate successive digits"));
}
let next1 = self.next_char(skipped);
let skipped = int_part.len() + fac_part.len();
if int_part.len() + fac_part.len() == 0 {
return Err(self.error_at(self.pos + skipped, "invalid radix point"));
} else if radix == 16 && (int_part.len() == 2) && (fac_part.len() == 1) {
return Err(self.error_at(self.pos + skipped, "mantissa has no digits"));
} else if matches!(next1, Some('e' | 'E')) && radix != 10 {
return Err(self.error_at(self.pos + skipped, "E exponent requires decimal mantissa"));
} else if matches!(next1, Some('p' | 'P')) && radix != 16 {
return Err(self.error_at(
self.pos + skipped,
"P exponent requires hexadecimal mantissa",
));
};
let mut skipped = int_part.len() + fac_part.len();
let exp_part = self
.next_char(skipped)
.and_then(|exp| {
matches!(exp, 'e' | 'E' | 'p' | 'P').then(|| {
let mut skipped = skipped;
(match self.next_char(skipped + 1) {
Some(signed @ ('+' | '-')) => {
skipped += 2;
format!("{}{}", exp, signed)
}
_ => {
skipped += 1;
format!("{}", exp)
}
}) + &self.scan_digits(
skipped,
if radix == 16 {
is_hex_digit
} else {
is_decimal_digit
},
)
})
})
.unwrap_or_default();
if radix == 16 && !fac_part.is_empty() && exp_part.is_empty() {
return Err(self.error_at(
self.pos + skipped + exp_part.len(),
"mantissa has no digits",
));
}
if exp_part
.chars()
.skip(1) .find(|&ch| ch != '+' && ch != '-')
== Some('_')
|| exp_part.ends_with('_')
{
return Err(self.error_at(
self.pos + skipped + exp_part.len(),
"'_' must separate successive digits",
));
}
skipped += exp_part.len();
let num_part = [int_part, fac_part, exp_part].concat();
if self.next_char(skipped) == Some('i') {
Ok(Token::Literal(LitKind::Imag, num_part + "i"))
} else if num_part.find('.').is_some() {
Ok(Token::Literal(LitKind::Float, num_part))
} else {
Ok(Token::Literal(LitKind::Integer, num_part))
}
}
}
/// Accepts every character except a newline.
fn is_unicode_char(c: char) -> bool {
    let newline = is_newline(c);
    !newline
}
/// True only for the line feed character U+000A.
fn is_newline(c: char) -> bool {
    matches!(c, '\n')
}
/// True when the character's Unicode general category is a letter category.
fn is_unicode_letter(c: char) -> bool {
    let category = GeneralCategory::of(c);
    category.is_letter()
}
/// True when the character's Unicode general category is `DecimalNumber`.
fn is_unicode_digit(c: char) -> bool {
    match GeneralCategory::of(c) {
        GeneralCategory::DecimalNumber => true,
        _ => false,
    }
}
/// True for an underscore or any Unicode letter.
fn is_letter(c: char) -> bool {
    if c == '_' {
        return true;
    }
    is_unicode_letter(c)
}
/// True for the binary digits `0` and `1`.
fn is_binary_digit(c: char) -> bool {
    c == '0' || c == '1'
}
/// True for the octal digits `0` through `7`.
fn is_octal_digit(c: char) -> bool {
    c.is_digit(8)
}
/// True for the ASCII decimal digits `0` through `9`.
fn is_decimal_digit(c: char) -> bool {
    c.is_ascii_digit()
}
/// True for ASCII hexadecimal digits: `0`-`9`, `a`-`f` and `A`-`F`.
fn is_hex_digit(c: char) -> bool {
    c.is_ascii_hexdigit()
}
/// True for the characters that may follow a backslash in a single-character
/// escape: `a b f n r t v \ ' "`.
fn is_escaped_char(c: char) -> bool {
    matches!(
        c,
        'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' | '\\' | '\'' | '"'
    )
}
#[cfg(test)]
mod tests {
    use super::Scanner;
    use crate::token::Operator;
    use crate::token::Token;

    /// The rest of a line counts as ended when only comments/whitespace remain.
    #[test]
    fn line_ended() {
        let ended = |s| Scanner::from(s).line_ended();
        for source in ["1", "/**/123"] {
            assert!(!ended(source));
        }
        for source in ["//", "/**/", "/*\n*/", "/**/ //"] {
            assert!(ended(source));
        }
    }

    /// A semicolon token is produced after a statement-ending token at a line end.
    #[test]
    fn insert_semicolon() {
        const SEMI: Token = Token::Operator(Operator::SemiColon);
        // For the first `count` tokens, records whether each one is a semicolon.
        let semicolons = |source: &str, count: usize| -> Vec<bool> {
            let mut scan = Scanner::from(source);
            (0..count)
                .map(|_| matches!(scan.next_token(), Ok(Some((_, SEMI)))))
                .collect()
        };
        assert_eq!(semicolons("return//\nreturn", 3), [false, true, false]);
        assert_eq!(
            semicolons("a\nb\nc", 5),
            [false, true, false, true, false]
        );
    }

    /// Multi-byte rune literals and simple operator sequences scan without error.
    #[test]
    fn scan_text() {
        let mut scan = Scanner::from("'一', '二', '三'");
        for _ in 0..5 {
            assert!(scan.next_token().is_ok());
        }
        let mut scan = Scanner::from("n%9");
        for _ in 0..3 {
            assert!(scan.next_token().is_ok());
        }
    }

    /// Numeric literals: the whole input must be consumed for a case to count as ok.
    #[test]
    fn scan_lit_number() {
        let numeric = |s: &str| {
            let mut sc = Scanner::from(s);
            let n = sc.scan_lit_number()?;
            if n.str_len() != s.len() {
                return Err(sc.error("scan not finished"));
            }
            Ok(n)
        };

        let valid_integers = [
            "1",
            "42",
            "4_2",
            "0600",
            "0_600",
            "0o600",
            "0O600",
            "0xBadFace",
            "0xBad_Face",
            "0x_67_7a_2f_cc_40_c6",
            "170141183460469231731687303715884105727",
            "170_141183_460469_231731_687303_715884_105727",
        ];
        for literal in valid_integers {
            assert!(numeric(literal).is_ok());
        }
        for literal in ["42_", "4__2", "0_xBadFace"] {
            assert!(numeric(literal).is_err());
        }

        let valid_floats = [
            "0.",
            "1E6",
            ".25",
            "1_5.",
            "72.40",
            "1.e+0",
            "072.40",
            "2.71828",
            ".12345E+5",
            "0.15e+0_2",
            "6.67428e-11",
            "0x15e",
            "0x1p-2",
            "0x2.p10",
            "0X.8p-0",
            "0x1.Fp+0",
            "0X_1FFFP-16",
        ];
        for literal in valid_floats {
            assert!(numeric(literal).is_ok());
        }
        let invalid_floats = [
            "1p-2", "1_.5", "1._5", "0x.p1", "1.5_e1", "1.5e_1", "1.5e1_", "1.5e+_1", "0x1.5e-2",
        ];
        for literal in invalid_floats {
            assert!(numeric(literal).is_err());
        }

        let valid_imaginaries = [
            "0i",
            "0.i",
            "1E6i",
            ".25i",
            "0123i",
            "0o123i",
            "0xabci",
            "1.e+0i",
            "2.71828i",
            "0x1p-2i",
            ".12345E+5i",
            "6.67428e-11i",
            "7.7388724745781045e+00i",
        ];
        for literal in valid_imaginaries {
            assert!(numeric(literal).is_ok());
        }
    }

    /// Rune literals: plain characters, escapes, and malformed inputs.
    #[test]
    fn scan_lit_rune() {
        let rune = |s| Scanner::from(s).scan_lit_rune();
        let accepted = [
            r#"'a'"#,
            r#"'ä'"#,
            r#"'本'"#,
            r#"'\t'"#,
            r#"'\000'"#,
            r#"'\007'"#,
            r#"'\377'"#,
            r#"'\x07'"#,
            r#"'\xff'"#,
            r#"'\u12e4'"#,
            r#"'\U00101234'"#,
            r#"'\''"#,
        ];
        for literal in accepted {
            assert!(rune(literal).is_ok());
        }
        let rejected = [
            r#"'aa'"#,
            r#"'\xa'"#,
            r#"'\0'"#,
            r#"'\uDFFF'"#,
            r#"'\U00110000'"#,
        ];
        for literal in rejected {
            assert!(rune(literal).is_err());
        }
    }

    /// String literals: raw and interpreted forms, plus invalid escapes.
    #[test]
    fn scan_lit_string() {
        let lit_str = |s| Scanner::from(s).scan_lit_string();
        let accepted = [
            "`abc`",
            r#""\n""#,
            r#""\"""#,
            r#""日本語""#,
            "`\\n\n\\n`",
            r#""\xff\u00FF""#,
            r#""Hello, world!\n""#,
            r#""\u65e5本\U00008a9e""#,
        ];
        for literal in accepted {
            assert!(lit_str(literal).is_ok());
        }
        for literal in [r#""\uD800""#, r#""\U00110000""#] {
            assert!(lit_str(literal).is_err());
        }
    }

    /// Comments come back as `Token::Comment` with their exact source text.
    #[test]
    fn scan_comment() {
        let mut comments = Scanner::from("// 注释\r\n//123\n/*注释*/");
        for expected in ["// 注释\r", "//123", "/*注释*/"] {
            assert!(matches!(
                comments.next_token(),
                Ok(Some((_, Token::Comment(text)))) if text == expected
            ));
        }
    }

    /// Line starts are recorded as character offsets, also inside comments.
    #[test]
    fn scan_line_info() {
        let code = "package main 代码 \n/*😃\n注释*/\n\n//🎃\n\n";
        let mut scanner = Scanner::from(code);
        while let Ok(Some(..)) = scanner.next_token() {}
        assert_eq!(scanner.lines, [17, 21, 26, 27, 31, 32]);
    }

    /// `line_info` maps an offset onto the last recorded start that is not past it.
    #[test]
    fn get_line_info() {
        let scanner = Scanner {
            lines: vec![10, 20, 30],
            ..Default::default()
        };
        for (pos, expected) in [(5, (1, 5)), (20, (2, 0)), (50, (3, 20))] {
            assert_eq!(scanner.line_info(pos), expected);
        }
    }
}