use super::grammar::Grammar;
use crate::builder::Buildable;
use crate::error::ErrorKind;
use crate::error::Result;
use crate::parser::AST;
use crate::regex::Allowed;
use crate::span::Span;
use crate::stream::StringStream;
use fragile::Fragile;
use newty::newty;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
use std::ops::Index;
use std::path::Path;
use std::path::PathBuf;
newty! {
#[derive(PartialOrd, Ord)]
pub id TerminalId
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
name: String,
id: TerminalId,
attributes: HashMap<usize, String>,
span: Span,
}
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "({})", self.name)
}
}
impl Index<&usize> for Token {
type Output = String;
fn index(&self, key: &usize) -> &Self::Output {
&self.attributes[key]
}
}
impl Token {
pub fn new(
name: String,
id: TerminalId,
attributes: HashMap<usize, String>,
span: Span,
) -> Self {
Self {
name,
id,
attributes,
span,
}
}
pub fn contains(&self, key: usize) -> bool {
self.attributes.contains_key(&key)
}
pub fn get(&self, key: usize) -> Option<&str> {
self.attributes.get(&key).map(|x| x.as_str())
}
pub fn attributes(&self) -> &HashMap<usize, String> {
&self.attributes
}
pub fn content(&self) -> &str {
self.force_get(0)
}
pub fn force_get(&self, key: usize) -> &str {
self.get(key).unwrap()
}
pub fn name(&self) -> &str {
self.name.as_str()
}
pub fn id(&self) -> TerminalId {
self.id
}
pub fn span(&self) -> &Span {
&self.span
}
}
#[derive(Debug)]
pub struct LexedStream<'lexer, 'stream> {
pub(crate) lexer: &'lexer Lexer,
stream: &'stream mut StringStream,
pos: usize,
tokens: Vec<(usize, Token)>,
last_span: Span,
}
impl<'lexer, 'stream> LexedStream<'lexer, 'stream> {
pub fn new(lexer: &'lexer Lexer, stream: &'stream mut StringStream) -> Self {
Self {
last_span: Span::new(
stream.origin(),
(0, 0),
(0, 0),
0,
0,
stream.text(),
stream.lines(),
),
lexer,
stream,
pos: 0,
tokens: Vec::new(),
}
}
fn lex_next(&mut self, allowed: Allowed) -> Result<bool> {
'lex: loop {
if self.stream.is_empty() {
break 'lex Ok(false);
} else if let Some(result) = self
.lexer
.grammar()
.pattern()
.find(self.stream.peek(), &allowed)
{
let name = result.name().to_string();
let mut attributes = HashMap::new();
for (i, attr) in result.groups().iter().enumerate() {
if let Some(a) = attr {
attributes.insert(i, a.text(self.stream.peek()).to_string());
}
}
let start = self.stream.pos();
self.stream.shift(result.chars_length());
let end = self.stream.pos();
let span = self.stream.span_between(start, end - 1);
if let Some(err_message) = self.lexer.grammar().err_message(result.id()) {
break 'lex ErrorKind::UnwantedToken {
span: Fragile::new(span),
message: err_message.to_string(),
}
.err();
}
if self.lexer.grammar().ignored(result.id()) {
continue;
}
let id = self.lexer.grammar.id(&name).unwrap();
let token = Token::new(name, id, attributes, span.clone());
self.last_span = span;
self.tokens.push((start, token));
break 'lex Ok(true);
} else {
break 'lex ErrorKind::LexingError {
span: Fragile::new(self.stream.curr_span()),
}
.err();
}
}
}
pub fn last_span(&self) -> &Span {
&self.last_span
}
}
impl LexedStream<'_, '_> {
pub fn next_any(&mut self) -> Result<Option<&Token>> {
self.next(Allowed::All)
}
pub fn next(&mut self, allowed: Allowed) -> Result<Option<&Token>> {
self.pos += 1;
if self.lex_next(allowed)? {
Ok(self.tokens.last().map(|(_, token)| token))
} else {
Ok(None)
}
}
pub fn peek(&self) -> Option<&Token> {
self.tokens.last().map(|(_, token)| token)
}
pub fn drop_last(&mut self) {
if let Some((pos, _)) = self.tokens.pop() {
self.pos -= 1;
while self.stream.pos() > pos {
self.stream.decr_pos();
}
}
}
pub fn lexer(&self) -> &Lexer {
self.lexer
}
pub fn is_empty(&self) -> bool {
self.stream.is_empty()
}
}
#[derive(Debug)]
pub struct Lexer {
grammar: Grammar,
}
impl Lexer {
pub fn new(grammar: Grammar) -> Self {
Self { grammar }
}
pub fn lex<'lexer, 'stream>(
&'lexer self,
stream: &'stream mut StringStream,
) -> LexedStream<'lexer, 'stream> {
LexedStream::new(self, stream)
}
pub fn grammar(&self) -> &Grammar {
&self.grammar
}
pub fn from_path(path: &Path) -> Result<Self> {
let grammar = Grammar::build_from_path(path)?;
Ok(Self { grammar })
}
}
impl Buildable for Lexer {
const RAW_EXTENSION: &'static str = Grammar::RAW_EXTENSION;
const COMPILED_EXTENSION: &'static str = Grammar::COMPILED_EXTENSION;
const AST_EXTENSION: &'static str = Grammar::AST_EXTENSION;
fn build_from_ast(ast: AST) -> Result<Self> {
let grammar = Grammar::build_from_ast(ast)?;
Ok(Self { grammar })
}
fn build_from_compiled(blob: &[u8], path: impl ToOwned<Owned = PathBuf>) -> Result<Self> {
let grammar = Grammar::build_from_compiled(blob, path)?;
Ok(Self { grammar })
}
fn build_from_plain(raw: StringStream) -> Result<Self> {
let grammar = Grammar::build_from_plain(raw)?;
Ok(Self { grammar })
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::span::Location;
macro_rules! test_token {
($name: ident $($key:literal = $value: tt), *) => {
{
let name = stringify!($name).to_string();
#[allow(unused_mut)]
let mut attributes = HashMap::new();
$(attributes.insert($key, stringify!($value).to_string());)*
TestToken {
name,
attributes
}
}
}
}
macro_rules! id_token {
($content: ident) => {
test_token!(ID 0=$content)
};
}
#[derive(Debug)]
struct TestToken {
name: String,
attributes: HashMap<usize, String>,
}
impl PartialEq<Token> for TestToken {
fn eq(&self, right: &Token) -> bool {
self.name == right.name
&& self.attributes.iter().all(|(key, value)| {
right.attributes.get(key).filter(|&v| v == value).is_some()
})
}
}
#[test]
fn token() {
let token = Token::new(
String::from("wow"),
0.into(),
HashMap::new(),
Span::new(
Path::new("test_file"),
(3, 0),
(3, 2),
10,
12,
"",
Vec::new(),
),
);
assert_eq!(token.name(), "wow");
assert_eq!(token.id(), 0.into());
assert_eq!(&*token.span().file(), Path::new("test_file"));
assert_eq!(token.span().start(), (3, 0));
assert_eq!(token.span().end(), (3, 2));
}
#[test]
fn lexer_builder() {
Lexer::build_from_plain(StringStream::new(
Path::new("<building the lexer>"),
"A ::= blu",
))
.unwrap();
}
#[test]
fn lex_basic() {
let lexer = Lexer::build_from_plain(StringStream::new(
Path::new("<basic lexing>"),
"A ::= blu",
))
.unwrap();
let mut input = StringStream::new(Path::new("input file"), "blu");
let mut lexed_input = lexer.lex(&mut input);
let token = lexed_input.next(Allowed::All).unwrap().unwrap();
assert_eq!(token.span().start(), (0, 0));
assert_eq!(token.span().end(), (0, 2));
assert!(lexed_input.next(Allowed::All).unwrap().is_none());
}
#[test]
fn unwantend_token() {
let lexer = Lexer::build_from_plain(StringStream::new(
Path::new("<unwanted token>"),
r#"ignore COMMENT ::= /\*([^*]|\*[^/])\*/
(unclosed comment) unwanted ECOMMENT ::= /\*([^*]|\*[^/])"#,
))
.unwrap();
let mut input = StringStream::new(Path::new("<unwanted input>"), "/* hello /");
let mut lexed_input = lexer.lex(&mut input);
let ErrorKind::UnwantedToken { message, .. } =
*lexed_input.next(Allowed::All).unwrap_err().kind
else {
panic!("wrong error");
};
assert_eq!("unclosed comment", message);
}
fn verify_input(
mut lexed_input: LexedStream<'_, '_>,
result: &[(Location, Location, &str)],
) {
let origin = &*lexed_input.stream.origin();
let mut i = 0;
while let Some(token) = lexed_input.next_any().unwrap() {
let (start, end, name) = result[i];
assert_eq!(
token.span().start(),
start,
"Token #{} {} differ by start location in stream {}.",
i,
token,
origin.display()
);
assert_eq!(
token.span().end(),
end,
"Token #{} {} differ by end location in stream {}.",
i,
token,
origin.display()
);
assert_eq!(
token.name(),
name,
"Token #{} {} differ by name in stream {}.",
i,
token,
origin.display()
);
i += 1;
}
assert_eq!(result.len(), i);
}
#[test]
fn default_lex_grammar() {
let lexer = Lexer::build_from_path(Path::new("src/parser/gmrs/dummy.lx")).unwrap();
let result = [
((0, 0), (0, 2), "ID"),
((0, 4), (0, 4), "PLUS"),
((0, 6), (0, 8), "ID"),
];
verify_input(
lexer.lex(&mut StringStream::new(Path::new("<input>"), "one + two")),
&result,
);
let result = [
((0, 0), (0, 1), "IF"),
((0, 3), (0, 6), "TRUE"),
((0, 8), (0, 10), "AND"),
((0, 12), (0, 16), "FALSE"),
((0, 18), (0, 18), "LBRACE"),
((1, 1), (1, 5), "ID"),
((1, 6), (1, 6), "LPAR"),
((1, 7), (1, 17), "STRING"),
((1, 18), (1, 18), "RPAR"),
((2, 0), (2, 0), "RBRACE"),
];
verify_input(
lexer.lex(&mut StringStream::new(
Path::new("<input>"),
"if true and false {\n\tifeat(\"something\")\n}",
)),
&result,
);
}
#[test]
fn default_parser_grammar() {
let lexer = Lexer::build_from_path(Path::new("src/parser/gmrs/earley.lx")).unwrap();
let mut input = StringStream::from_file(Path::new("src/parser/gmrs/dummy.gr")).unwrap();
let mut lexed_input = lexer.lex(&mut input);
let result = [
id_token!(IfStatement),
test_token!(ASSIGNMENT),
id_token!(IF),
id_token!(Expression),
test_token!(AT),
id_token!(condition),
id_token!(LBRACE),
id_token!(StatementList),
test_token!(AT),
id_token!(then),
id_token!(RBRACE),
test_token!(LPROXY),
id_token!(NoElse),
test_token!(RPROXY),
id_token!(IF),
id_token!(Expression),
test_token!(AT),
id_token!(condition),
id_token!(LBRACE),
id_token!(StatementList),
test_token!(AT),
id_token!(then),
id_token!(RBRACE),
id_token!(ELSE),
id_token!(LBRACE),
id_token!(StatementList),
test_token!(AT),
id_token!(else),
id_token!(RBRACE),
test_token!(LPROXY),
id_token!(Else),
test_token!(RPROXY),
test_token!(SEMICOLON),
id_token!(WhileStatement),
test_token!(ASSIGNMENT),
id_token!(WHILE),
id_token!(Expression),
test_token!(AT),
id_token!(condition),
id_token!(LBRACE),
id_token!(StatementList),
test_token!(AT),
id_token!(do),
id_token!(RBRACE),
test_token!(LPROXY),
test_token!(RPROXY),
test_token!(SEMICOLON),
id_token!(Assignment),
test_token!(ASSIGNMENT),
id_token!(ID),
test_token!(DOT),
test_token!(INT 0=0),
test_token!(AT),
id_token!(key),
id_token!(EQUALS),
id_token!(Expression),
test_token!(AT),
id_token!(value),
test_token!(LPROXY),
test_token!(RPROXY),
test_token!(SEMICOLON),
id_token!(BuiltinType),
test_token!(ASSIGNMENT),
id_token!(INT),
test_token!(DOT),
test_token!(INT 0=0),
test_token!(AT),
id_token!(value),
test_token!(LPROXY),
id_token!(Int),
test_token!(RPROXY),
id_token!(STRING),
test_token!(DOT),
test_token!(INT 0=0),
test_token!(AT),
id_token!(value),
test_token!(LPROXY),
id_token!(String),
test_token!(RPROXY),
id_token!(ID),
test_token!(DOT),
test_token!(INT 0=0),
test_token!(AT),
id_token!(value),
test_token!(LPROXY),
id_token!(Id),
test_token!(RPROXY),
id_token!(TRUE),
test_token!(LPROXY),
id_token!(True),
test_token!(RPROXY),
id_token!(FALSE),
test_token!(LPROXY),
id_token!(False),
test_token!(RPROXY),
test_token!(SEMICOLON),
id_token!(Atom),
test_token!(ASSIGNMENT),
id_token!(BuiltinType),
test_token!(AT),
id_token!(this),
test_token!(LPROXY),
id_token!(Builtin),
test_token!(RPROXY),
id_token!(LPAR),
id_token!(Expression),
test_token!(AT),
id_token!(this),
id_token!(RPAR),
test_token!(LPROXY),
id_token!(Through),
test_token!(RPROXY),
test_token!(SEMICOLON),
id_token!(Expression),
test_token!(ASSIGNMENT),
id_token!(Expression),
test_token!(AT),
id_token!(left),
id_token!(PLUS),
id_token!(Expression),
test_token!(AT),
id_token!(right),
test_token!(LPROXY),
id_token!(Add),
test_token!(RPROXY),
id_token!(Expression),
test_token!(AT),
id_token!(left),
id_token!(ASTERISK),
id_token!(Expression),
test_token!(AT),
id_token!(right),
test_token!(LPROXY),
id_token!(Mul),
test_token!(RPROXY),
id_token!(Atom),
test_token!(AT),
id_token!(this),
test_token!(LPROXY),
id_token!(Through),
test_token!(RPROXY),
test_token!(SEMICOLON),
id_token!(Statement),
test_token!(ASSIGNMENT),
id_token!(Assignment),
test_token!(AT),
id_token!(this),
id_token!(SEMICOLON),
test_token!(LPROXY),
id_token!(Assign),
test_token!(RPROXY),
id_token!(IfStatement),
test_token!(AT),
id_token!(this),
test_token!(LPROXY),
id_token!(If),
test_token!(RPROXY),
id_token!(WhileStatement),
test_token!(AT),
id_token!(this),
test_token!(LPROXY),
id_token!(While),
test_token!(RPROXY),
test_token!(SEMICOLON),
test_token!(AT),
id_token!(StatementList),
test_token!(ASSIGNMENT),
id_token!(StatementList),
test_token!(AT),
id_token!(left),
id_token!(Statement),
test_token!(AT),
id_token!(right),
test_token!(LPROXY),
id_token!(Concat),
test_token!(RPROXY),
id_token!(Statement),
test_token!(AT),
id_token!(this),
test_token!(LPROXY),
id_token!(Through),
test_token!(RPROXY),
test_token!(SEMICOLON),
];
for (i, tok) in result.iter().enumerate() {
let token = lexed_input
.next(Allowed::All)
.unwrap()
.unwrap_or_else(|| panic!("Expected token named {}, found EOF", tok.name));
assert_eq!(
tok,
token,
"Lexer error @{} {}:{} does not match token #{}",
token.span().file().display(),
token.span().start().0 + 1,
token.span().end().0 + 1,
i
);
}
if let Some(token) = lexed_input.next(Allowed::All).unwrap() {
panic!(
"Lexer error @{} {}:{} does not match token EOF",
token.span().file().display(),
token.span().start().0 + 1,
token.span().end().0 + 1
);
}
}
}