use std::{
collections::{HashMap, HashSet},
hash::Hash,
slice::Iter
};
use num_traits::{PrimInt, Unsigned};
use regex::{self, Regex, RegexBuilder};
use lrpar::{LexError, Lexeme, Lexer};
pub struct Rule<StorageT> {
pub tok_id: Option<StorageT>,
pub name: Option<String>,
pub re_str: String,
pub re: Regex
}
impl<StorageT> Rule<StorageT> {
#[doc(hidden)]
pub fn new(
tok_id: Option<StorageT>,
name: Option<String>,
re_str: String
) -> Result<Rule<StorageT>, regex::Error> {
let re = RegexBuilder::new(&format!("\\A(?:{})", &re_str))
.multi_line(true)
.dot_matches_new_line(true)
.build()?;
Ok(Rule {
tok_id,
name,
re_str,
re
})
}
}
pub struct LexerDef<StorageT> {
pub(crate) rules: Vec<Rule<StorageT>>
}
impl<StorageT: Copy + Eq + Hash + PrimInt + Unsigned> LexerDef<StorageT> {
pub fn new(rules: Vec<Rule<StorageT>>) -> LexerDef<StorageT> {
LexerDef { rules }
}
pub fn get_rule(&self, idx: usize) -> Option<&Rule<StorageT>> {
self.rules.get(idx)
}
pub fn get_rule_by_id(&self, tok_id: StorageT) -> &Rule<StorageT> {
&self
.rules
.iter()
.find(|r| r.tok_id == Some(tok_id))
.unwrap()
}
pub fn get_rule_by_name(&self, n: &str) -> Option<&Rule<StorageT>> {
self.rules
.iter()
.find(|r| r.name.as_ref().map(String::as_str) == Some(n))
}
pub fn set_rule_ids<'a>(
&'a mut self,
rule_ids_map: &HashMap<&'a str, StorageT>
) -> (Option<HashSet<&'a str>>, Option<HashSet<&'a str>>) {
let mut missing_from_parser_idxs = Vec::new();
let mut rules_with_names = 0;
for (i, r) in self.rules.iter_mut().enumerate() {
if let Some(ref n) = r.name {
match rule_ids_map.get(&**n) {
Some(tok_id) => r.tok_id = Some(*tok_id),
None => {
r.tok_id = None;
missing_from_parser_idxs.push(i);
}
}
rules_with_names += 1;
}
}
let missing_from_parser;
if missing_from_parser_idxs.is_empty() {
missing_from_parser = None;
} else {
let mut mfp = HashSet::with_capacity(missing_from_parser_idxs.len());
for i in &missing_from_parser_idxs {
mfp.insert(self.rules[*i].name.as_ref().unwrap().as_str());
}
missing_from_parser = Some(mfp);
};
let missing_from_lexer;
if rules_with_names - missing_from_parser_idxs.len() == rule_ids_map.len() {
missing_from_lexer = None
} else {
missing_from_lexer = Some(
rule_ids_map
.keys()
.cloned()
.collect::<HashSet<&str>>()
.difference(
&self
.rules
.iter()
.filter(|x| x.name.is_some())
.map(|x| &**x.name.as_ref().unwrap())
.collect::<HashSet<&str>>()
)
.cloned()
.collect::<HashSet<&str>>()
);
}
(missing_from_lexer, missing_from_parser)
}
pub fn iter_rules(&self) -> Iter<Rule<StorageT>> {
self.rules.iter()
}
pub fn lexer<'a>(&'a self, s: &'a str) -> impl Lexer<StorageT> + 'a {
LRLexer::new(self, s)
}
}
pub struct LRLexer<'a, StorageT> {
s: &'a str,
lexemes: Vec<Result<Lexeme<StorageT>, LexError>>,
newlines: Vec<usize>
}
impl<'a, StorageT: Copy + Eq + Hash + PrimInt + Unsigned> LRLexer<'a, StorageT> {
fn new(lexerdef: &'a LexerDef<StorageT>, s: &'a str) -> LRLexer<'a, StorageT> {
let mut lexemes = vec![];
let mut newlines = vec![];
let mut i = 0;
while i < s.len() {
let old_i = i;
let mut longest = 0;
let mut longest_ridx = 0;
for (ridx, r) in lexerdef.iter_rules().enumerate() {
if let Some(m) = r.re.find(&s[old_i..]) {
let len = m.end();
if len > longest {
longest = len;
longest_ridx = ridx;
}
}
}
if longest > 0 {
newlines.extend(
s[old_i..old_i + longest]
.chars()
.enumerate()
.filter(|&(_, c)| c == '\n')
.map(|(j, _)| old_i + j + 1)
);
let r = lexerdef.get_rule(longest_ridx).unwrap();
if r.name.is_some() {
match r.tok_id {
Some(tok_id) => {
lexemes.push(Ok(Lexeme::new(tok_id, old_i, Some(longest))));
}
None => {
lexemes.push(Err(LexError { idx: old_i }));
break;
}
}
}
i += longest;
} else {
lexemes.push(Err(LexError { idx: old_i }));
break;
}
}
LRLexer {
s,
lexemes,
newlines
}
}
}
impl<'a, StorageT: Copy + Eq + Hash + PrimInt + Unsigned> Lexer<StorageT>
for LRLexer<'a, StorageT>
{
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = Result<Lexeme<StorageT>, LexError>> + 'b> {
Box::new(self.lexemes.iter().cloned())
}
fn line_col(&self, i: usize) -> (usize, usize) {
if i > self.s.len() {
panic!("Offset {} exceeds known input length {}", i, self.s.len());
}
fn line_col_off<StorageT>(lexer: &LRLexer<StorageT>, i: usize) -> (usize, usize) {
if lexer.newlines.is_empty() || i < lexer.newlines[0] {
return (1, i);
}
for j in 0..lexer.newlines.len() - 1 {
if lexer.newlines[j + 1] > i {
return (j + 2, i - lexer.newlines[j]);
}
}
(
lexer.newlines.len() + 1,
i - lexer.newlines[lexer.newlines.len() - 1]
)
}
let (line_idx, col_byte) = line_col_off(self, i);
let line = self.surrounding_line_str(i);
(line_idx, line[..col_byte].chars().count() + 1)
}
fn surrounding_line_str(&self, i: usize) -> &str {
fn surrounding_line_off<StorageT>(lexer: &LRLexer<StorageT>, i: usize) -> (usize, usize) {
if i > lexer.s.len() {
panic!("Offset {} exceeds known input length {}", i, lexer.s.len());
}
if lexer.newlines.is_empty() {
return (0, lexer.s.len());
} else if i < lexer.newlines[0] {
return (0, lexer.newlines[0] - 1);
}
for j in 0..lexer.newlines.len() - 1 {
if lexer.newlines[j + 1] > i {
return (lexer.newlines[j], lexer.newlines[j + 1] - 1);
}
}
(lexer.newlines[lexer.newlines.len() - 1], lexer.s.len())
}
let (st, en) = surrounding_line_off(self, i);
&self.s[st..en]
}
fn lexeme_str(&self, l: &Lexeme<StorageT>) -> &str {
&self.s[l.start()..l.end()]
}
}
#[cfg(test)]
mod test {
use super::*;
use crate::parser::parse_lex;
use std::collections::HashMap;
#[test]
fn test_basic() {
let src = r"
%%
[0-9]+ 'int'
[a-zA-Z]+ 'id'
[ \t] ;
"
.to_string();
let mut lexer = parse_lex(&src).unwrap();
let mut map = HashMap::new();
map.insert("int", 0);
map.insert("id", 1);
assert_eq!(lexer.set_rule_ids(&map), (None, None));
let lexemes = lexer
.lexer(&"abc 123")
.iter()
.map(|x| x.unwrap())
.collect::<Vec<_>>();
assert_eq!(lexemes.len(), 2);
let lex1 = lexemes[0];
assert_eq!(lex1.tok_id(), 1u8);
assert_eq!(lex1.start(), 0);
assert_eq!(lex1.len(), 3);
let lex2 = lexemes[1];
assert_eq!(lex2.tok_id(), 0);
assert_eq!(lex2.start(), 4);
assert_eq!(lex2.len(), 3);
}
#[test]
fn test_basic_error() {
let src = "
%%
[0-9]+ 'int'
"
.to_string();
let lexer = parse_lex::<u8>(&src).unwrap();
match lexer.lexer(&"abc").iter().nth(0).unwrap() {
Ok(_) => panic!("Invalid input lexed"),
Err(LexError { idx: 0 }) => (),
Err(e) => panic!("Incorrect error returned {:?}", e)
};
}
#[test]
fn test_longest_match() {
let src = "%%
if 'IF'
[a-z]+ 'ID'
[ ] ;"
.to_string();
let mut lexer = parse_lex(&src).unwrap();
let mut map = HashMap::new();
map.insert("IF", 0);
map.insert("ID", 1);
assert_eq!(lexer.set_rule_ids(&map), (None, None));
let lexemes = lexer
.lexer(&"iff if")
.iter()
.map(|x| x.unwrap())
.collect::<Vec<_>>();
assert_eq!(lexemes.len(), 2);
let lex1 = lexemes[0];
assert_eq!(lex1.tok_id(), 1u8);
assert_eq!(lex1.start(), 0);
assert_eq!(lex1.len(), 3);
let lex2 = lexemes[1];
assert_eq!(lex2.tok_id(), 0);
assert_eq!(lex2.start(), 4);
assert_eq!(lex2.len(), 2);
}
#[test]
fn test_multibyte() {
let src = "%%
[a❤]+ 'ID'
[ ] ;"
.to_string();
let mut lexerdef = parse_lex(&src).unwrap();
let mut map = HashMap::new();
map.insert("ID", 0u8);
assert_eq!(lexerdef.set_rule_ids(&map), (None, None));
let lexer = lexerdef.lexer("a ❤ a");
let lexemes = lexer.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
assert_eq!(lexemes.len(), 3);
let lex1 = lexemes[0];
assert_eq!(lex1.start(), 0);
assert_eq!(lex1.len(), 1);
assert_eq!(lexer.lexeme_str(&lex1), "a");
let lex2 = lexemes[1];
assert_eq!(lex2.start(), 2);
assert_eq!(lex2.len(), 3);
assert_eq!(lexer.lexeme_str(&lex2), "❤");
let lex3 = lexemes[2];
assert_eq!(lex3.start(), 6);
assert_eq!(lex3.len(), 1);
assert_eq!(lexer.lexeme_str(&lex3), "a");
}
#[test]
fn test_line_col() {
let src = "%%
[a-z]+ 'ID'
[ \\n] ;"
.to_string();
let mut lexerdef = parse_lex(&src).unwrap();
let mut map = HashMap::new();
map.insert("ID", 0u8);
assert_eq!(lexerdef.set_rule_ids(&map), (None, None));
let lexer = lexerdef.lexer("a b c");
let lexemes = lexer.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
assert_eq!(lexemes.len(), 3);
assert_eq!(lexer.line_col(lexemes[1].start()), (1, 3));
assert_eq!(lexer.surrounding_line_str(lexemes[1].start()), "a b c");
let lexer = lexerdef.lexer("a b c\n");
let lexemes = lexer.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
assert_eq!(lexemes.len(), 3);
assert_eq!(lexer.line_col(lexemes[1].start()), (1, 3));
assert_eq!(lexer.surrounding_line_str(lexemes[1].start()), "a b c");
let lexer = lexerdef.lexer(" a\nb\n c d");
let lexemes = lexer.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
assert_eq!(lexemes.len(), 4);
assert_eq!(lexer.line_col(lexemes[0].start()), (1, 2));
assert_eq!(lexer.line_col(lexemes[1].start()), (2, 1));
assert_eq!(lexer.line_col(lexemes[2].start()), (3, 3));
assert_eq!(lexer.line_col(lexemes[3].start()), (3, 5));
assert_eq!(lexer.surrounding_line_str(lexemes[0].start()), " a");
assert_eq!(lexer.surrounding_line_str(lexemes[1].start()), "b");
assert_eq!(lexer.surrounding_line_str(lexemes[2].start()), " c d");
assert_eq!(lexer.surrounding_line_str(lexemes[3].start()), " c d");
}
#[test]
fn test_line_col_multibyte() {
let src = "%%
[a-z❤]+ 'ID'
[ \\n] ;"
.to_string();
let mut lexerdef = parse_lex(&src).unwrap();
let mut map = HashMap::new();
map.insert("ID", 0u8);
assert_eq!(lexerdef.set_rule_ids(&map), (None, None));
let lexer = lexerdef.lexer(" a\n❤ b");
let lexemes = lexer.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
assert_eq!(lexemes.len(), 3);
assert_eq!(lexer.line_col(lexemes[0].start()), (1, 2));
assert_eq!(lexer.line_col(lexemes[1].start()), (2, 1));
assert_eq!(lexer.line_col(lexemes[2].start()), (2, 3));
assert_eq!(lexer.surrounding_line_str(lexemes[0].start()), " a");
assert_eq!(lexer.surrounding_line_str(lexemes[1].start()), "❤ b");
assert_eq!(lexer.surrounding_line_str(lexemes[2].start()), "❤ b");
}
#[test]
#[should_panic]
fn test_bad_line_col() {
let src = "%%
[a-z]+ 'ID'
[ \\n] ;"
.to_string();
let mut lexerdef = parse_lex(&src).unwrap();
let mut map = HashMap::new();
map.insert("ID", 0u8);
assert_eq!(lexerdef.set_rule_ids(&map), (None, None));
let lexer = lexerdef.lexer("a b c");
lexer.line_col(100);
}
#[test]
fn test_missing_from_lexer_and_parser() {
let src = "%%
[a-z]+ 'ID'
[ \\n] ;"
.to_string();
let mut lexerdef = parse_lex(&src).unwrap();
let mut map = HashMap::new();
map.insert("INT", 0u8);
let mut missing_from_lexer = HashSet::new();
missing_from_lexer.insert("INT");
let mut missing_from_parser = HashSet::new();
missing_from_parser.insert("ID");
assert_eq!(
lexerdef.set_rule_ids(&map),
(Some(missing_from_lexer), Some(missing_from_parser))
);
match lexerdef.lexer(&" a ").iter().nth(0).unwrap() {
Ok(_) => panic!("Invalid input lexed"),
Err(LexError { idx: 1 }) => (),
Err(e) => panic!("Incorrect error returned {:?}", e)
};
}
}